Indexer.php 81.1 KB
Newer Older
1
<?php
2

3
/*
4
 * This file is part of the TYPO3 CMS project.
5
 *
6
7
8
 * It is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License, either version 2
 * of the License, or any later version.
9
 *
10
11
 * For the full copyright and license information, please read the
 * LICENSE.txt file that was distributed with this source code.
12
 *
13
14
 * The TYPO3 project - inspiring people to share!
 */
15

16
17
namespace TYPO3\CMS\IndexedSearch;

18
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
19
use TYPO3\CMS\Core\Core\Environment;
20
use TYPO3\CMS\Core\Database\Connection;
21
use TYPO3\CMS\Core\Database\ConnectionPool;
22
use TYPO3\CMS\Core\Html\HtmlParser;
23
use TYPO3\CMS\Core\Http\RequestFactory;
24
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
25
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
26
use TYPO3\CMS\Core\Utility\GeneralUtility;
27
use TYPO3\CMS\Core\Utility\MathUtility;
28
use TYPO3\CMS\Core\Utility\PathUtility;
29
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
30
use TYPO3\CMS\IndexedSearch\Hook\CrawlerFilesHook;
31
use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
32

33
34
35
/**
 * Indexing class for TYPO3 frontend
 */
36
37
class Indexer
{
38

39
40
41
    /**
     * @var array
     */
42
    public $reasons = [
43
44
45
46
47
48
        -1 => 'mtime matched the document, so no changes detected and no content updated',
        -2 => 'The minimum age was not exceeded',
        1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
        2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
        3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
        4 => 'Page has never been indexed (is not represented in the index_phash table).'
49
    ];
50
51
52
53
54
55
56
57
58
59
60
61
62

    /**
     * HTML code blocks to exclude from indexing
     *
     * @var string
     */
    public $excludeSections = 'script,style';

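    /**
     * Supported extensions for external files: external parser objects, keys are file extension names.
     * Values are objects with certain methods.
     *
     * @var array
     */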
63
    public $external_parsers = [];
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

    /**
     * Fe-group list (pages might be indexed separately for each usergroup combination to support search
     * in access limited pages!)
     *
     * @var string
     */
    public $defaultGrList = '0,-1';

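    /**
     * Max time: If set, this tells a number of seconds that is the maximum age of an indexed document.
     * Regardless of mtime the document will be re-indexed if this limit is exceeded.
     *
     * @var int
     */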
    public $tstamp_maxAge = 0;

    /**
     * Min time: If set, this tells a minimum limit before a document can be indexed again.
     * This is regardless of mtime.
     *
     * @var int
     */
    public $tstamp_minAge = 0;

    /**
     * Max number of external files to index.
     *
     * @var int
     */
    public $maxExternalFiles = 0;

    /**
     * If set, indexTypo3PageContent() indexes the page even when the mtime and gr_list checks
     * would not require it.
     *
     * @var bool
     */
    public $forceIndexing = false;

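    /**
     * Template for the content array built by splitHTMLContent(); provides the keys
     * "title", "description", "keywords" and "body" with empty values.
     *
     * @var array
     */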
108
    public $defaultContentArray = [
109
110
111
112
        'title' => '',
        'description' => '',
        'keywords' => '',
        'body' => ''
113
    ];
114
115
116
117
118
119
120
121
122
123
124
125
126
127

    /**
     * @var int
     */
    public $wordcount = 0;

    /**
     * @var int
     */
    public $externalFileCounter = 0;

    /**
     * Configuration set internally (see init functions for required keys and their meaning)
     *
     * @var array
     */
128
    public $conf = [];
129
130
131
132
133
134

    /**
     * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
     *
     * @var array
     */
135
    public $indexerConfig = [];
136
137

    /**
     * Hash array, contains phash and phash_grouping
     *
     * @var array
     */
142
    public $hash = [];
143
144
145
146
147
148

    /**
     * Hash array for files
     *
     * @var array
     */
149
    public $file_phash_arr = [];
150
151
152
153
154
155

    /**
     * Content of TYPO3 page, split into parts by splitHTMLContent()
     *
     * @var array
     */
156
    public $contentParts = [];
157
158
159
160
161
162
163
164
165
166
167

    /**
     * Hash over the concatenated content parts, set in indexTypo3PageContent()
     * from IndexedSearchUtility::md5inthash().
     *
     * @var string
     */
    public $content_md5h = '';

    /**
     * Internal log
     *
     * @var array
     */
168
    public $internal_log = [];
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220

    /**
     * Content fetched for the external URL last processed by indexExternalUrl()
     *
     * @var string
     */
    public $indexExternalUrl_content = '';

    /**
     * @var int
     */
    public $freqRange = 32000;

    /**
     * @var float
     */
    public $freqMax = 0.1;

    /**
     * @var bool
     */
    public $enableMetaphoneSearch = false;

    /**
     * @var bool
     */
    public $storeMetaphoneInfoAsWords;

    /**
     * @var string
     */
    public $metaphoneContent = '';

    /**
     * Metaphone object, if any
     *
     * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
     */
    public $metaphoneObj;

    /**
     * Lexer object for word splitting
     *
     * @var \TYPO3\CMS\IndexedSearch\Lexer
     */
    public $lexerObj;

    /**
     * @var bool
     */
    public $flagBitMask;

221
222
223
224
225
226
227
228
229
230
231
    /**
     * @var TimeTracker
     */
    protected $timeTracker;

    /**
     * Indexer constructor.
     *
     * Fetches the TimeTracker instance and derives the indexer settings from the
     * extension configuration of "indexed_search". The "minAge" and "maxAge"
     * values are multiplied by 3600 before they are stored.
     */
    public function __construct()
    {
        $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
        // Indexer configuration from Extension Manager interface
        $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
        $this->tstamp_minAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
        $this->tstamp_maxAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
        // Missing keys are read as 0, consistent with minAge/maxAge above, instead of
        // triggering an undefined-index access. The fourth argument (5) is handed to
        // forceIntegerInRange() as its default value for maxExternalFiles.
        $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
        $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
        // Workaround: If the extension configuration was not updated yet, the value is not existing
        $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
        $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    }

    /********************************
     *
     * Initialization
     *
     *******************************/
248

249
    /**
     * Initializes the object.
     *
     * Sets the page hashes, initializes the external parsers when "index_externals" is set in
     * $this->conf, creates the lexer and - if metaphone search is enabled and a class is
     * registered - the metaphone object.
     *
     * @param array|null $configuration will be used to set $this->conf, otherwise $this->conf MUST be set with proper values prior to this call
     */
    public function init(?array $configuration = null)
    {
        if (is_array($configuration)) {
            $this->conf = $configuration;
        }
        // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
        $this->setT3Hashes();
        // Initialize external document parsers:
        // Example configuration, see ext_localconf.php of this file!
        if (!empty($this->conf['index_externals'])) {
            $this->initializeExternalParsers();
        }
        // Initialize lexer (class that deconstructs the text into words).
        // An unset or empty registration falls back to the default Lexer class.
        $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? '') ?: Lexer::class;
        $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
        $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? null;
        // Initialize metaphone hook:
        // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
        $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? '';
        if ($this->enableMetaphoneSearch && $metaphoneClassName) {
            $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
            $this->metaphoneObj->pObj = $this;
        }
    }

    /**
     * Instantiates every external parser registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
     * and keeps those whose initParser() call succeeds, keyed by file extension.
     *
     * @internal
     * @see init()
     */
    public function initializeExternalParsers()
    {
        $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
        foreach ($registeredParsers as $extension => $className) {
            $parser = GeneralUtility::makeInstance($className);
            // The parser is registered and linked to this indexer before it is initialized
            $this->external_parsers[$extension] = $parser;
            $parser->pObj = $this;
            // A parser that reports FALSE from its initialization is dropped again
            if (!$parser->initParser($extension)) {
                unset($this->external_parsers[$extension]);
            }
        }
    }

    /********************************
     *
     * Indexing; TYPO3 pages (HTML content)
     *
     *******************************/
    /**
     * Start indexing of the TYPO3 page
     *
     * Uses checkMtimeTstamp(), is_grlist_set() and $forceIndexing to decide whether the page has
     * to be processed at all. If so, the content is split and hashed. A full re-index (charset
     * conversion, word extraction, page and word submission, external files) is only done when
     * checkContentHash() does not return an array or when the check result is 1. Otherwise only
     * timestamp, set id, gr_list and rootline are updated.
     */
    public function indexTypo3PageContent()
    {
        $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
        $is_grlist = $this->is_grlist_set($this->hash['phash']);
        $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
        if (!$indexingNeeded) {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
            return;
        }

        // Setting message:
        if ($this->forceIndexing) {
            $message = 'Indexing needed, reason: Forced';
        } elseif ($check > 0) {
            $message = 'Indexing needed, reason: ' . $this->reasons[$check];
        } else {
            $message = 'Indexing needed, reason: Updates gr_list!';
        }
        $this->log_setTSlogMessage($message, 1);

        // Divide into title, keywords, description and body:
        $this->log_push('Split content', '');
        $this->contentParts = $this->splitHTMLContent($this->conf['content']);
        if ($this->conf['indexedDocTitle']) {
            $this->contentParts['title'] = $this->conf['indexedDocTitle'];
        }
        $this->log_pull();

        // The hash covers all content parts (title, description and keywords included), so a
        // changed page title also counts as changed content.
        $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
        // checkContentHash() looks for an already indexed page with the very same content hash.
        // If one exists there is nothing to re-index regardless of mtime; this also avoids
        // re-indexing for logged-in fe_users when the page content is the same anyway.
        $checkCHash = $this->checkContentHash();
        if (is_array($checkCHash) && $check !== 1) {
            // Update the timestamp
            $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
            $this->updateSetId($this->hash['phash']);
            // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
            $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
            $this->updateRootline();
            $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
            return;
        }

        $Pstart = IndexedSearchUtility::milliseconds();
        $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
        $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
        $this->log_pull();

        // Splitting words
        $this->log_push('Extract words from content', '');
        $splitInWords = $this->processWordsInArrays($this->contentParts);
        $this->log_pull();

        // Analyze the indexed words.
        $this->log_push('Analyze the extracted words', '');
        $indexArr = $this->indexAnalyze($splitInWords);
        $this->log_pull();

        // Submitting page (phash) record
        $this->log_push('Submitting page', '');
        $this->submitPage();
        $this->log_pull();

        // Check words and submit to word list if not there
        $this->log_push('Check word list and submit words', '');
        if (IndexedSearchUtility::isTableUsed('index_words')) {
            $this->checkWordList($indexArr);
            $this->submitWords($indexArr, $this->hash['phash']);
        }
        $this->log_pull();

        // Set parsetime
        $this->updateParsetime($this->hash['phash'], IndexedSearchUtility::milliseconds() - $Pstart);

        // Checking external files if configured for.
        $this->log_push('Checking external files', '');
        if ($this->conf['index_externals']) {
            $this->extractLinks($this->conf['content']);
        }
        $this->log_pull();
    }

    /**
     * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
     *
     * The head is everything in front of the first "<body"; the title is taken from the first
     * TITLE tag (only the part after the first ":" if there is one). Keywords and description
     * are collected from meta tags when "index_metatags" is set in $this->conf. The body is
     * reduced by the TYPO3SEARCH markers and the sections listed in $this->excludeSections
     * before all tags are stripped.
     *
     * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ie. splitting the header by ":")
     * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
     * @see splitRegularContent()
     */
    public function splitHTMLContent($content)
    {
        // divide head from body
        $contentArr = $this->defaultContentArray;
        // stristr() returns FALSE when there is no <body tag; keep the body a string in that case.
        // Note: the negative length below is then 0, so the head part is empty as well.
        $contentArr['body'] = stristr($content, '<body') ?: '';
        $headPart = substr($content, 0, -strlen($contentArr['body']));
        // get title
        $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
        $titleParts = explode(':', $contentArr['title'], 2);
        $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
        // get keywords and description metatags
        if (!empty($this->conf['index_metatags'])) {
            $meta = [];
            $i = 0;
            while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
                $i++;
            }
            // @todo The code below stops at first unset tag. Is that correct?
            for ($i = 0; isset($meta[$i]); $i++) {
                // decode HTML entities, meta tag content needs to be encoded later
                $meta[$i] = GeneralUtility::get_tag_attributes($meta[$i], true);
                // Meta tags without a "name" or "content" attribute (e.g. charset or
                // property based ones) are treated as having empty values.
                $metaName = (string)($meta[$i]['name'] ?? '');
                $metaContent = (string)($meta[$i]['content'] ?? '');
                if (stripos($metaName, 'keywords') !== false) {
                    $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
                }
                if (stripos($metaName, 'description') !== false) {
                    $contentArr['description'] .= ',' . $metaContent;
                }
            }
        }
        // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
        $this->typoSearchTags($contentArr['body']);
        // Get rid of unwanted sections (ie. scripting and style stuff) in body
        $tagList = explode(',', $this->excludeSections);
        foreach ($tagList as $tag) {
            // Each successful call writes the body without the found section back into the body
            while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
            }
        }
        // remove tags, but first make sure we don't concatenate words by doing it
        $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
        $contentArr['body'] = trim(strip_tags($contentArr['body']));
        $contentArr['keywords'] = trim($contentArr['keywords']);
        $contentArr['description'] = trim($contentArr['description']);
        // Return array
        return $contentArr;
    }

    /**
     * Reads the charset from the first meta tag carrying http-equiv="CONTENT-TYPE"
     * (matched case-insensitively).
     *
     * @param string $content HTML content
     * @return string The charset value if found, otherwise an empty string.
     */
    public function getHTMLcharset($content)
    {
        $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
        if (!preg_match($metaTagPattern, $content, $metaTagMatch)) {
            return '';
        }
        // Only the matched meta tag itself is searched for the charset declaration
        $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
        if (!preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)) {
            return '';
        }
        return $charsetMatch[1];
    }

    /**
     * Converts a HTML document to utf-8 and decodes its HTML entities.
     *
     * @param string $content HTML content, any charset
     * @param string $charset Optional charset (otherwise extracted from HTML via getHTMLcharset())
     * @return string Converted HTML
     */
    public function convertHTMLToUtf8($content, $charset = '')
    {
        // Fall back to the charset declared in the document when none was passed
        if (!$charset) {
            $charset = $this->getHTMLcharset($content);
        }
        $sourceCharset = trim(strtolower($charset));
        $needsConversion = $sourceCharset && $sourceCharset !== 'utf-8';
        if ($needsConversion) {
            $content = mb_convert_encoding($content, 'utf-8', $sourceCharset);
        }
        // Convert entities, assuming document is now UTF-8:
        return html_entity_decode($content);
    }

    /**
     * Finds first occurrence of embracing tags and returns the embraced content and the original string with
     * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
     * <title> of document or removing <script>-sections
     *
     * The tag is searched case-insensitively. If no end tag follows, $tagContent is set to an empty
     * string and $stringAfter only receives the text following the start tag (the text in front of
     * the tag is not kept in that case).
     *
     * @param string $string String to search in
     * @param string $tagName Tag name, eg. "script"
     * @param string $tagContent Passed by reference: Content inside found tag
     * @param string $stringAfter Passed by reference: Content after found tag
     * @param string $paramList Passed by reference: Attributes of the found tag.
     * @return bool Returns FALSE if tag was not found, otherwise TRUE.
     */
    public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
    {
        $string = (string)$string;
        $endTag = '</' . $tagName . '>';
        $startTag = '<' . $tagName;
        // stristr used because we want a case-insensitive search for the tag.
        $isTagInText = stristr($string, $startTag);
        // if the tag was not found, return FALSE
        if (!$isTagInText) {
            return false;
        }
        // Split the attribute list from the rest at the first ">". A start tag that is never
        // closed yields only one element, so pad to always get an (empty) remainder.
        [$paramList, $isTagInText] = array_pad(explode('>', substr($isTagInText, strlen($startTag)), 2), 2, '');
        $afterTagInText = stristr($isTagInText, $endTag);
        if ($afterTagInText) {
            $stringBefore = substr($string, 0, stripos($string, $startTag));
            $tagContent = substr($isTagInText, 0, strlen($isTagInText) - strlen($afterTagInText));
            $stringAfter = $stringBefore . substr($afterTagInText, strlen($endTag));
        } else {
            $tagContent = '';
            $stringAfter = $isTagInText;
        }
        return true;
    }

    /**
     * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
     *
     * The body is split at every "<!--TYPO3SEARCH_" marker. Text following a "begin" marker is
     * kept. At an "end" marker the last remembered segment that was neither "begin" nor "end"
     * is appended (this keeps text in front of an "end" marker that has no preceding "begin").
     *
     * @param string $body HTML Content, passed by reference
     * @return bool Returns TRUE if a TYPO3SEARCH_ tag was found, otherwise FALSE.
     */
    public function typoSearchTags(&$body)
    {
        $segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
        if (count($segments) <= 1) {
            // No marker present: the body stays untouched
            return false;
        }
        $body = '';
        $pending = '';
        foreach ($segments as $segment) {
            $part = explode('-->', $segment, 2);
            $marker = trim($part[0]);
            if ($marker === 'begin') {
                $body .= $part[1];
                $pending = '';
            } elseif ($marker === 'end') {
                $body .= $pending;
            } else {
                $pending = $segment;
            }
        }
        return true;
    }

    /**
     * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
     *
     * For every link returned by extractHyperLinks():
     * - links with a scheme and no local path are passed to indexExternalUrl() when
     *   "indexExternalURLs" is set in the indexer configuration,
     * - links without a query string that resolve to an existing local file are either added
     *   to the crawler queue (if the crawler is used) or indexed via indexRegularDocument().
     * A "jumpurl" query parameter replaces the link before it is evaluated.
     *
     * @param string $content HTML content
     */
    public function extractLinks($content)
    {
        $crawler = null;
        // Get links:
        $list = $this->extractHyperLinks($content);
        if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && ExtensionManagementUtility::isLoaded('crawler')) {
            /**
             * todo: remove dependency to class tx_crawler_lib
             * @link https://forge.typo3.org/issues/83603
             */
            $crawler = GeneralUtility::makeInstance('tx_crawler_lib');
        }
        // Traverse links:
        foreach ($list as $linkInfo) {
            $hasLocalPath = !empty($linkInfo['localPath']);
            // Decode entities.
            // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
            $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
            // Parse URL; parse_url() returns FALSE for seriously malformed URLs
            $qParts = parse_url($linkSource) ?: [];
            // Check for jumpurl (TYPO3 specific thing...)
            if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== false) {
                parse_str($qParts['query'], $getP);
                if (!isset($getP['jumpurl']) || !is_string($getP['jumpurl'])) {
                    // "jumpurl=" only matched as part of another parameter name or is not a
                    // plain string: there is no target to follow, skip this link.
                    continue;
                }
                $linkSource = $getP['jumpurl'];
                $qParts = parse_url($linkSource) ?: [];
            }
            if (!$hasLocalPath && !empty($qParts['scheme'])) {
                if (!empty($this->indexerConfig['indexExternalURLs'])) {
                    // Index external URL (http or otherwise)
                    $this->indexExternalUrl($linkSource);
                }
                continue;
            }
            if (!empty($qParts['query'])) {
                // Local links carrying a query string are not treated as files
                continue;
            }
            $linkSource = urldecode($linkSource);
            $localFile = GeneralUtility::isAllowedAbsPath($linkSource)
                ? $linkSource
                : GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
            if (!$localFile || !@is_file($localFile)) {
                continue;
            }
            // Index local file:
            if (is_object($crawler)) {
                $params = ['document' => $linkSource];
                if ($hasLocalPath) {
                    $params['alturl'] = $linkInfo['href'];
                }
                $params['conf'] = $this->conf;
                // The page content is not handed over to the queue entry
                unset($params['conf']['content']);
                $crawler->addQueueEntry_callBack(0, $params, CrawlerFilesHook::class, $this->conf['id']);
                $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
            } elseif ($hasLocalPath) {
                // pathinfo() yields an empty string for files without an extension
                $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
                $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
            } else {
                $this->indexRegularDocument($linkSource);
            }
        }
    }

    /**
     * Extracts all links to external documents from the HTML content string
     *
     * Anchor tags without a href attribute and links whose href starts with "#" are skipped.
     *
     * @param string $html
     * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
     * @see extractLinks()
     */
    public function extractHyperLinks($html)
    {
        $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
        $htmlParts = $htmlParser->splitTags('a', $html);
        $hyperLinksData = [];
        foreach ($htmlParts as $index => $tagData) {
            // Only the odd-indexed parts of the split result are evaluated as tags
            if ($index % 2 === 0) {
                continue;
            }
            $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
            $firstTagName = $htmlParser->getFirstTagName($tagData);
            if (strtolower($firstTagName) !== 'a') {
                continue;
            }
            // Anchors like <a name="..."> carry no href at all
            $href = (string)($tagAttributes[0]['href'] ?? '');
            if (!$href || $href[0] === '#') {
                continue;
            }
            $hyperLinksData[] = [
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath(urldecode($href))
            ];
        }
        return $hyperLinksData;
    }

    /**
     * Extracts the "base href" from content string.
     *
     * The href of the first base tag that has a non-empty href attribute is returned.
     *
     * @param string $html Content to analyze
     * @return string The base href or an empty string if not found
     */
    public function extractBaseHref($html)
    {
        $href = '';
        $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
        $htmlParts = $htmlParser->splitTags('base', $html);
        foreach ($htmlParts as $index => $tagData) {
            // Only the odd-indexed parts of the split result are evaluated as tags
            if ($index % 2 === 0) {
                continue;
            }
            $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
            $firstTagName = $htmlParser->getFirstTagName($tagData);
            if (strtolower($firstTagName) !== 'base') {
                continue;
            }
            // A base tag without href (e.g. only "target") must not turn the result into NULL
            $href = (string)($tagAttributes[0]['href'] ?? '');
            if ($href) {
                break;
            }
        }
        return $href;
    }

    /******************************************
     *
     * Indexing; external URL
     *
     ******************************************/
    /**
     * Index External URLs HTML content
     *
     * The URL is only fetched when its response headers announce a "text/html" content type.
     * The fetched content is written to a temporary file, indexed through indexRegularDocument()
     * and the temporary file is removed afterwards.
     *
     * @param string $externalUrl URL, eg. "https://typo3.org/"
     * @see indexRegularDocument()
     */
    public function indexExternalUrl($externalUrl)
    {
        // Get headers:
        $urlHeaders = $this->getUrlHeaders($externalUrl);
        if (!is_array($urlHeaders)) {
            return;
        }
        // HTTP header names are case-insensitive, so look the content type up independent of
        // the spelling used in the response; a missing header counts as "not HTML".
        $contentType = array_change_key_case($urlHeaders, CASE_LOWER)['content-type'] ?? '';
        if (stripos($contentType, 'text/html') === false) {
            return;
        }
        $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
        if ((string)$content === '') {
            return;
        }
        // Create temporary file:
        $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
        if (!$tmpFile) {
            return;
        }
        try {
            GeneralUtility::writeFile($tmpFile, $content);
            // Index that file:
            // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
            $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
        } finally {
            // Remove the temporary file even if indexing fails with an exception
            unlink($tmpFile);
        }
    }

    /**
     * Fetches the response headers of a URL with a HEAD request.
     *
     * @param string $url The URL
     * @return mixed FALSE if the request raised an exception. Otherwise an array with the header names as keys and the concatenated header values as values
     */
    public function getUrlHeaders($url)
    {
        try {
            $requestFactory = GeneralUtility::makeInstance(RequestFactory::class);
            $responseHeaders = $requestFactory->request($url, 'HEAD')->getHeaders();
            $flattened = [];
            foreach ($responseHeaders as $headerName => $headerValues) {
                // All values of one header are joined without a separator
                $flattened[$headerName] = implode('', $headerValues);
            }
            return $flattened;
        } catch (\Exception $e) {
            // fail silently if the HTTP request failed
            return false;
        }
    }

    /**
     * Tries the available path resolvers one after the other and returns the
     * result of the first one that yields a non-empty path.
     *
     * @param string $sourcePath
     * @return string Absolute path to file if file is local, else empty string
     */
    protected function createLocalPath($sourcePath)
    {
        $resolvers = [
            'createLocalPathUsingAbsRefPrefix',
            'createLocalPathUsingDomainURL',
            'createLocalPathFromAbsoluteURL',
            'createLocalPathFromRelativeURL'
        ];
        $resolvedPath = '';
        foreach ($resolvers as $resolver) {
            $resolvedPath = $this->{$resolver}($sourcePath);
            if ($resolvedPath != '') {
                return $resolvedPath;
            }
        }
        // No resolver produced a path: hand back the (empty) result of the last attempt
        return $resolvedPath;
    }

    /**
     * Attempts to create a local file path by matching a current request URL.
     *
     * @param string $sourcePath
     * @return string
     */
    protected function createLocalPathUsingDomainURL($sourcePath)
    {
        $localPath = '';
        $baseURL = (string)GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
        $baseURLLength = strlen($baseURL);
        // An empty base URL must never match: strpos() with an empty needle emits a
        // warning on PHP 7 and reports a match at offset 0 for every string on PHP 8.
        if ($baseURLLength > 0 && strpos($sourcePath, $baseURL) === 0) {
            // Strip the site URL, the remainder is taken as a path below the public web path
            $sourcePath = substr($sourcePath, $baseURLLength);
            $localPath = Environment::getPublicPath() . '/' . $sourcePath;
            if (!self::isAllowedLocalFile($localPath)) {
                $localPath = '';
            }
        }
        return $localPath;
    }

    /**
     * Attempts to create a local file path by matching absRefPrefix. This
     * requires TSFE. If TSFE is missing, this function does nothing.
     *
     * @param string $sourcePath
     * @return string
     */
    protected function createLocalPathUsingAbsRefPrefix($sourcePath)
    {
        $localPath = '';
        if (isset($GLOBALS['TSFE']) && $GLOBALS['TSFE'] instanceof TypoScriptFrontendController) {
            // absRefPrefix may not be configured at all; treat that like an empty prefix
            // instead of reading an undefined array key.
            $absRefPrefix = isset($GLOBALS['TSFE']->config['config']['absRefPrefix'])
                ? (string)$GLOBALS['TSFE']->config['config']['absRefPrefix']
                : '';
            $absRefPrefixLength = strlen($absRefPrefix);
            if ($absRefPrefixLength > 0 && strpos($sourcePath, $absRefPrefix) === 0) {
                // Strip the prefix, the remainder is taken as a path below the public web path
                $sourcePath = substr($sourcePath, $absRefPrefixLength);
                $localPath = Environment::getPublicPath() . '/' . $sourcePath;
                if (!self::isAllowedLocalFile($localPath)) {
                    $localPath = '';
                }
            }
        }
        return $localPath;
    }

    /**
     * Attempts to create a local file path from the absolute URL without
     * schema.
     *
     * @param string $sourcePath
     * @return string
     */
    protected function createLocalPathFromAbsoluteURL($sourcePath)
    {
        $localPath = '';
        // isset() guards against an empty string, where reading offset 0 would
        // raise an "uninitialized string offset" notice/warning.
        if (isset($sourcePath[0]) && $sourcePath[0] === '/') {
            // Drop the leading slash, the remainder is taken as a path below the public web path
            $sourcePath = substr($sourcePath, 1);
            $localPath = Environment::getPublicPath() . '/' . $sourcePath;
            if (!self::isAllowedLocalFile($localPath)) {
                $localPath = '';
            }
        }
        return $localPath;
    }

    /**
     * Attempts to create a local file path from the relative URL.
     *
     * @param string $sourcePath
     * @return string
     */
    protected function createLocalPathFromRelativeURL($sourcePath)
    {
        // Only URLs without scheme and without leading slash are handled here
        if (!self::isRelativeURL($sourcePath)) {
            return '';
        }
        $localPath = Environment::getPublicPath() . '/' . $sourcePath;
        // Discard the candidate unless it is an existing file inside the public web path
        return self::isAllowedLocalFile($localPath) ? $localPath : '';
    }

    /**
     * Checks if URL is relative.
     *
     * @param string $url
     * @return bool
     */
    protected static function isRelativeURL($url)
    {
        $urlParts = @parse_url($url);
        if (!is_array($urlParts)) {
            // parse_url() returns false for seriously malformed URLs. Such input has always
            // been reported as "relative" by this method; that result is kept, but without
            // indexing into a non-array value.
            return true;
        }
        $hasScheme = isset($urlParts['scheme']) && $urlParts['scheme'] !== '';
        // The "path" component is absent for inputs like "?id=1"
        $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
        // Relative means: no scheme and the path does not start at the root
        return !$hasScheme && ($path === '' || $path[0] !== '/');
    }

    /**
     * Checks if the path points to the file inside the web site
     *
     * @param string $filePath
     * @return bool
     */
    protected static function isAllowedLocalFile($filePath)
    {
        // Collapse "../" segments first, so the prefix check works on the effective path
        $filePath = GeneralUtility::resolveBackPath($filePath);
        // Compare against the public path including a trailing slash. A bare prefix
        // comparison would also accept sibling directories whose name merely starts
        // with the public path (e.g. "<public>-backup/file.pdf").
        $publicPathPrefix = rtrim(Environment::getPublicPath(), '/') . '/';
        $insideWebPath = strpos($filePath, $publicPathPrefix) === 0;
        // Skip the file system lookup when the path is outside the web path anyway
        return $insideWebPath && is_file($filePath);
    }

    /******************************************
     *
     * Indexing; external files (PDF, DOC, etc)
     *
     ******************************************/
    /**
     * Indexing a regular document given as $file (relative to public web path, local file)
     *
     * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
     * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
     * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
     * @param string $altExtension File extension for temporary file. If set, it is used instead of the extension of $file.
     */
    public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
    {
        // Init
        $fI = pathinfo($file);
        // pathinfo() omits the "extension" key for names without a dot, so fall back to ''
        $ext = $altExtension ?: strtolower(isset($fI['extension']) ? $fI['extension'] : '');
        // Create abs-path:
        if (!$contentTmpFile) {
            if (!GeneralUtility::isAbsPath($file)) {
                // Relative, prepend public web path:
                $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
            } else {
                // Absolute, pass-through:
                $absFile = $file;
            }
            $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
        } else {
            // Content is read from the temporary file, $file is only used for naming/hashing
            $absFile = $contentTmpFile;
        }
        // Nothing to index without a readable file:
        if (!$absFile || !@is_file($absFile)) {
            $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
            return;
        }
        // Nothing to index without a parser registered for the extension
        // (empty() also covers extensions that have no entry at all):
        if (empty($this->external_parsers[$ext])) {
            $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
            return;
        }
        // Indexing the document, one pass per content part:
        $fileInfo = stat($absFile);
        $cParts = $this->fileContentParts($ext, $absFile);
        foreach ($cParts as $cPKey) {
            $this->internal_log = [];
            $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
            $parseStart = IndexedSearchUtility::milliseconds();
            // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
            $subinfo = ['key' => $cPKey];
            $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
            $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
            if ($check > 0 || $force) {
                if ($check > 0) {
                    $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
                } else {
                    $this->log_setTSlogMessage('Indexing forced by flag', 1);
                }
                // Check external file counter:
                if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                    // Divide into title,keywords,description and body:
                    $this->log_push('Split content', '');
                    $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                    $this->log_pull();
                    if (is_array($contentParts)) {
                        // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                        $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                        if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                            // Increment counter:
                            $this->externalFileCounter++;
                            // Splitting words
                            $this->log_push('Extract words from content', '');
                            $splitInWords = $this->processWordsInArrays($contentParts);
                            $this->log_pull();
                            // Analyze the indexed words.
                            $this->log_push('Analyze the extracted words', '');
                            $indexArr = $this->indexAnalyze($splitInWords);
                            $this->log_pull();
                            // Submitting page (phash) record
                            $this->log_push('Submitting page', '');
                            // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                            $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                            $this->log_pull();
                            // Check words and submit to word list if not there
                            $this->log_push('Check word list and submit words', '');
                            if (IndexedSearchUtility::isTableUsed('index_words')) {
                                $this->checkWordList($indexArr);
                                $this->submitWords($indexArr, $phash_arr['phash']);
                            }
                            $this->log_pull();
                            // Set parsetime
                            $this->updateParsetime($phash_arr['phash'], IndexedSearchUtility::milliseconds() - $parseStart);
                        } else {
                            // Update the timestamp
                            $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                            $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                        }
                    } else {
                        $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                    }
                } else {
                    $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
                }
            } else {
                $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
            }
            // Checking and setting sections:
            // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
            $this->submitFile_section($phash_arr['phash']);
            $this->log_pull();
        }
    }

    /**
     * Reads the content of an external file being indexed.
     * The content from the external parser MUST be returned in utf-8!
     *
     * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
     * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
     * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
     * @return array Standard content array (title, description, keywords, body keys)
     */
    public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
    {
        // Without a parser object registered for the extension there is nothing to read.
        // isset() keeps unregistered extensions from triggering an undefined-index notice.
        if (!isset($this->external_parsers[$fileExtension]) || !is_object($this->external_parsers[$fileExtension])) {
            return null;
        }
        // Consult relevant external document parser:
        return $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
    }

    /**
     * Creates an array with pointers to divisions of document.
     *
     * @param string $ext File extension
     * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
     * @return array Array of pointers to sections that the document should be divided into
     */
    public function fileContentParts($ext, $absFile)
    {
        // Default: a single part with pointer 0, i.e. the document is not divided
        $cParts = [0];
        // Consult relevant external document parser:
        // (isset() keeps unregistered extensions from triggering an undefined-index notice)
        if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
            $cParts = $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
        }
        return $cParts;
    }

    /**
     * Splits non-HTML content (from external files for instance)
     *
For faster browsing, not all history is shown. View entire blame