[TASK] Separate configuration into hooks for Indexer 39/64039/6
authorBenni Mack <benni@typo3.org>
Wed, 1 Apr 2020 12:07:22 +0000 (14:07 +0200)
committerBenni Mack <benni@typo3.org>
Fri, 3 Apr 2020 14:33:07 +0000 (16:33 +0200)
This is a pre-patch to separate crawler-specific implementation from
the actual indexing configuration.

This first patch unifies the configuration creation for the indexer,
and places the configuration in each respective logic (see TSFE hook),
whereas the Indexer no longer knows about TSFE or "called from Backend",
so that the indexer just does its work.

The next iteration will then separate all indexing logic into smaller
chunks again.

Resolves: #90920
Releases: master
Change-Id: I631ddd0839ee8f03d0a6c3c86d302b935e599a5e
Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/64039
Tested-by: TYPO3com <noreply@typo3.com>
Tested-by: Andreas Fernandez <a.fernandez@scripting-base.de>
Tested-by: Susanne Moog <look@susi.dev>
Tested-by: Benni Mack <benni@typo3.org>
Reviewed-by: Andreas Fernandez <a.fernandez@scripting-base.de>
Reviewed-by: Susanne Moog <look@susi.dev>
Reviewed-by: Benni Mack <benni@typo3.org>
typo3/sysext/indexed_search/Classes/Hook/CrawlerFilesHook.php
typo3/sysext/indexed_search/Classes/Hook/CrawlerHook.php
typo3/sysext/indexed_search/Classes/Hook/TypoScriptFrontendHook.php
typo3/sysext/indexed_search/Classes/Indexer.php
typo3/sysext/indexed_search/Tests/Unit/IndexerTest.php
typo3/sysext/indexed_search/ext_localconf.php

index 3b762dd..450da5e 100644 (file)
@@ -14,6 +14,9 @@ namespace TYPO3\CMS\IndexedSearch\Hook;
  * The TYPO3 project - inspiring people to share!
  */
 
+use TYPO3\CMS\Core\Utility\GeneralUtility;
+use TYPO3\CMS\IndexedSearch\Indexer;
+
 /**
  * Crawler hook for indexed search. Works with the "crawler" extension
  * This hook is specifically used to index external files found on pages through the crawler extension.
@@ -34,10 +37,8 @@ class CrawlerFilesHook
         if (!is_array($params['conf'])) {
             return;
         }
-        // Initialize the indexer class:
-        $indexerObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
-        $indexerObj->conf = $params['conf'];
-        $indexerObj->init();
+        $indexerObj = GeneralUtility::makeInstance(Indexer::class);
+        $indexerObj->init($params['conf']);
         // Index document:
         if ($params['alturl']) {
             $fI = pathinfo($params['document']);
index eef8b84..a08c15f 100644 (file)
@@ -24,6 +24,7 @@ use TYPO3\CMS\Core\Exception\Page\RootLineException;
 use TYPO3\CMS\Core\Utility\GeneralUtility;
 use TYPO3\CMS\Core\Utility\MathUtility;
 use TYPO3\CMS\Core\Utility\RootlineUtility;
+use TYPO3\CMS\IndexedSearch\Indexer;
 
 /**
  * Crawler hook for indexed search. Works with the "crawler" extension
@@ -339,9 +340,7 @@ class CrawlerHook
                 // Get root line (need to provide this when indexing external files)
                 $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
                 // (Re)-Indexing file on page.
-                $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
-                $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
-                $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
+                $indexerObj = $this->initializeIndexer($cfgRec['pid'], 0, 0, '', $rl, [], $cfgRec['uid'], $cfgRec['set_id']);
                 $indexerObj->hash['phash'] = -1;
                 // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
                 // Index document:
@@ -613,9 +612,7 @@ class CrawlerHook
     public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
     {
         // Index external URL:
-        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
-        $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
-        $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
+        $indexerObj = $this->initializeIndexer($pageId, 0, 0, '', $rl, [], $cfgUid, $setId);
         $indexerObj->hash['phash'] = -1;
         // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
         $indexerObj->indexExternalUrl($url);
@@ -658,16 +655,13 @@ class CrawlerHook
      */
     public function indexSingleRecord($r, $cfgRec, $rl = null)
     {
-        // Init:
         $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
         $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
         $languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'];
         $sys_language_uid = $languageField ? $r[$languageField] : 0;
-        // (Re)-Indexing a row from a table:
-        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
         parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
-        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams);
-        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
+        // (Re)-Indexing a row from a table
+        $indexerObj = $this->initializeIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['uid'], $cfgRec['set_id']);
         $indexerObj->forceIndexing = true;
         $theContent = '';
         $theTitle = '';
@@ -678,8 +672,15 @@ class CrawlerHook
                 $theContent .= $r[$v] . ' ';
             }
         }
-        // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
-        $indexerObj->backend_indexAsTYPO3Page(strip_tags(str_replace('<', ' <', $theTitle)), '', '', strip_tags(str_replace('<', ' <', $theContent)), 'utf-8', $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']], $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']], $r['uid']);
+        // Indexing the record as a page (but with parameters set)
+        $this->indexAsTYPO3Page(
+            $indexerObj,
+            strip_tags(str_replace('<', ' <', $theTitle)),
+            strip_tags(str_replace('<', ' <', $theContent)),
+            $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
+            $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
+            $r['uid']
+        );
     }
 
     /**
@@ -910,4 +911,91 @@ class CrawlerHook
             }
         }
     }
+
+    /**
+     * Creates an Indexer and initializes it with the configuration assembled for the given page context.
+     *
+     * @param int $id The page uid, &id=
+     * @param int $type The page type, &type=
+     * @param int $sys_language_uid sys_language uid, typically &L=
+     * @param string $MP The MP variable (Mount Points), &MP=
+     * @param array $uidRL Rootline array of only UIDs.
+     * @param array $queryArguments Array of GET variables to register with this indexing
+     * @param int $freeIndexUid Free index UID
+     * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
+     * @return Indexer The indexer, already init()-ed with the assembled configuration
+     */
+    protected function initializeIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $queryArguments = [], $freeIndexUid = 0, $freeIndexSetId = 0): Indexer
+    {
+        // Information about the page for which the indexing takes place
+        $pageContext = [
+            // Page id (int)
+            'id' => $id,
+            // Page type (int)
+            'type' => $type,
+            // sys_language UID of the language of the indexing (int)
+            'sys_language_uid' => $sys_language_uid,
+            // MP variable, if any (Mount Points) (string)
+            'MP' => $MP,
+            // Group list (hardcoded for now...)
+            'gr_list' => '0,-1',
+            // GET variables registered with this indexing
+            'staticPageArguments' => $queryArguments,
+            // Free index uid and the id of the indexing "set"
+            'freeIndexUid' => $freeIndexUid,
+            'freeIndexSetId' => $freeIndexSetId,
+            // Root line uids
+            'rootline_uids' => $uidRL,
+        ];
+        $fixedBehavior = [
+            // Whether to index external documents like PDF, DOC etc. (if possible)
+            'index_externals' => 1,
+            // Length of description text (max 250, default 200)
+            'index_descrLgd' => 200,
+            // Whether to index document keywords and description (if present)
+            'index_metatags' => true,
+        ];
+        $indexer = GeneralUtility::makeInstance(Indexer::class);
+        $indexer->init($pageContext + $fixedBehavior);
+        return $indexer;
+    }
+
+    /**
+     * Feeds a record to the given indexer as if it were the content of a TYPO3 page.
+     *
+     * @param Indexer $indexer Indexer whose conf is filled before indexTypo3PageContent() is called
+     * @param string $title Title equivalent
+     * @param string $content The main content to index
+     * @param int $mtime Last modification time, in seconds
+     * @param int $crdate The creation date of the content, in seconds
+     * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
+     */
+    protected function indexAsTYPO3Page(Indexer $indexer, $title, $content, $mtime, $crdate = 0, $recordUid = 0)
+    {
+        // Construct fake HTML for parsing, with title and content HTML-escaped
+        $fakeHtml = '
+               <html>
+                       <head>
+                               <title>' . htmlspecialchars($title) . '</title>
+                       </head>
+                       <body>
+                               ' . htmlspecialchars($content) . '
+                       </body>
+               </html>';
+        // Register the record's metadata and content on the indexer configuration
+        // Most recent modification time (seconds) of the content
+        $indexer->conf['mtime'] = $mtime;
+        // The creation date of the TYPO3 content
+        $indexer->conf['crdate'] = $crdate;
+        // UID of the record, if applicable
+        $indexer->conf['recordUid'] = $recordUid;
+        // Content string (HTML of TYPO3 page)
+        $indexer->conf['content'] = $fakeHtml;
+        // Character set of content (will be converted to utf-8 during indexing)
+        $indexer->conf['metaCharset'] = 'utf-8';
+        // Alternative title for indexing
+        $indexer->conf['indexedDocTitle'] = '';
+        // Index content as if it was a TYPO3 page:
+        $indexer->indexTypo3PageContent();
+    }
 }
index 7612d00..3131655 100644 (file)
@@ -14,7 +14,15 @@ namespace TYPO3\CMS\IndexedSearch\Hook;
  * The TYPO3 project - inspiring people to share!
  */
 
+use TYPO3\CMS\Core\Charset\CharsetConverter;
+use TYPO3\CMS\Core\Charset\UnknownCharsetException;
+use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
+use TYPO3\CMS\Core\Context\Context;
+use TYPO3\CMS\Core\Context\LanguageAspect;
+use TYPO3\CMS\Core\TimeTracker\TimeTracker;
+use TYPO3\CMS\Core\Utility\GeneralUtility;
 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
+use TYPO3\CMS\IndexedSearch\Indexer;
 
 /**
  * Hooks for \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController (TSFE).
@@ -23,19 +31,140 @@ use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
 class TypoScriptFrontendHook
 {
     /**
-     * Frontend hook: If the page is not being re-generated this is our chance to force it to be (because re-generation of the page is required in order to have the indexer called!)
+     * Frontend hook: If the page is not being re-generated this is our chance to force it to be
+     * (because re-generation of the page is required in order to have the indexer called!)
      *
      * @param array $params Parameters from frontend
-     * @param TypoScriptFrontendController $ref TSFE object
+     * @param TypoScriptFrontendController $tsfe TSFE object
      */
-    public function headerNoCache(array &$params, $ref)
+    public function headerNoCache(array &$params, TypoScriptFrontendController $tsfe)
     {
         // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction:
-        if (\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler') && $params['pObj']->applicationData['tx_crawler']['running'] && in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])) {
-            // Setting simple log entry:
-            $params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: ' . $params['disableAcquireCacheData'];
+        if (in_array('tx_indexedsearch_reindex', $tsfe->applicationData['tx_crawler']['parameters']['procInstructions'] ?? [], true)) {
             // Disables a look-up for cached page data - thus resulting in re-generation of the page even if cached.
             $params['disableAcquireCacheData'] = true;
+            // Enable indexing
+            $tsfe->applicationData['forceIndexing'] = true;
         }
     }
+
+    /**
+     * Trigger indexing of content, after evaluating if this page could / should be indexed.
+     * Skip reasons are logged to the TimeTracker; try/finally keeps push() and pull() balanced.
+     *
+     * @param TypoScriptFrontendController $tsfe TSFE object providing config, page record and content
+     */
+    public function hook_indexContent(TypoScriptFrontendController $tsfe)
+    {
+        if (!$tsfe->config['config']['index_enable']) {
+            return;
+        }
+        // Indexer configuration from Extension Manager interface:
+        $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');
+        $forceIndexing = $tsfe->applicationData['forceIndexing'] ?? false;
+        $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
+        $timeTracker->push('Index page');
+        try {
+            if ($disableFrontendIndexing && !$forceIndexing) {
+                $timeTracker->setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
+                return;
+            }
+            if ($tsfe->page['no_search']) {
+                $timeTracker->setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
+                return;
+            }
+            if ($tsfe->no_cache) {
+                $timeTracker->setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
+                return;
+            }
+            /** @var LanguageAspect $languageAspect */
+            $languageAspect = GeneralUtility::makeInstance(Context::class)->getAspect('language');
+            if ($languageAspect->getId() !== $languageAspect->getContentId()) {
+                $timeTracker->setTSlogMessage('Index page? No, languageId was different from contentId which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
+                return;
+            }
+            // Init and start indexing
+            $indexer = GeneralUtility::makeInstance(Indexer::class);
+            $indexer->forceIndexing = $forceIndexing;
+            $indexer->init($this->initializeIndexerConfiguration($tsfe, $languageAspect));
+            $indexer->indexTypo3PageContent();
+        } finally {
+            $timeTracker->pull();
+        }
+    }
+
+    /**
+     * Assembles the Indexer configuration array from the current frontend rendering:
+     * page identification, rendered content and the config.index_* behavior settings.
+     *
+     * @param TypoScriptFrontendController $tsfe Source of page, content and TypoScript config values
+     * @param LanguageAspect $languageAspect Its getId() becomes the sys_language_uid of the indexing
+     * @return array Configuration to be handed to Indexer::init()
+     */
+    protected function initializeIndexerConfiguration(TypoScriptFrontendController $tsfe, LanguageAspect $languageAspect): array
+    {
+        // Collect the uid of every root line entry, keeping the root line keys
+        $rootlineUids = [];
+        foreach ($tsfe->config['rootLine'] as $level => $rootlinePage) {
+            $rootlineUids[$level] = $rootlinePage['uid'];
+        }
+        $pageArguments = $tsfe->getPageArguments();
+        $groupIds = GeneralUtility::makeInstance(Context::class)->getPropertyFromAspect('frontend.user', 'groupIds', [0, -1]);
+        return [
+            // Page id
+            'id' => $tsfe->id,
+            // Page type
+            'type' => $tsfe->type,
+            // sys_language UID of the language of the indexing.
+            'sys_language_uid' => $languageAspect->getId(),
+            // MP variable, if any (Mount Points)
+            'MP' => $tsfe->MP,
+            // Group list
+            'gr_list' => implode(',', $groupIds),
+            // page arguments array
+            'staticPageArguments' => $pageArguments ? $pageArguments->getStaticArguments() : [],
+            // The creation date of the TYPO3 page
+            'crdate' => $tsfe->page['crdate'],
+            // Root line uids
+            'rootline_uids' => $rootlineUids,
+            // Content string (HTML of TYPO3 page)
+            'content' => $tsfe->content,
+            // Alternative title for indexing
+            'indexedDocTitle' => $this->convOutputCharset($tsfe->indexedDocTitle, $tsfe->metaCharset),
+            // Character set of content (will be converted to utf-8 during indexing)
+            'metaCharset' => $tsfe->metaCharset,
+            // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
+            'mtime' => $tsfe->register['SYS_LASTCHANGED'] ?? $tsfe->page['SYS_LASTCHANGED'],
+            // Whether to index external documents like PDF, DOC etc. (if possible)
+            'index_externals' => $tsfe->config['config']['index_externals'],
+            // Length of description text (max 250, default 200)
+            'index_descrLgd' => $tsfe->config['config']['index_descrLgd'],
+            // Whether to index document keywords and description; true when not configured
+            'index_metatags' => $tsfe->config['config']['index_metatags'] ?? true,
+            // Set to zero
+            'recordUid' => 0,
+            'freeIndexUid' => 0,
+            'freeIndexSetId' => 0,
+        ];
+    }
+
+    /**
+     * Converts input string from utf-8 to metaCharset IF the two charsets are different.
+     * @param string $content Content to be converted.
+     * @param string $metaCharset Target charset; 'utf-8' returns $content unchanged.
+     * @return string Converted content string.
+     * @throws \RuntimeException if the CharsetConverter reports $metaCharset as unknown
+     */
+    protected function convOutputCharset(string $content, string $metaCharset): string
+    {
+        if ($metaCharset === 'utf-8') {
+            return $content;
+        }
+        try {
+            return GeneralUtility::makeInstance(CharsetConverter::class)->conv($content, 'utf-8', $metaCharset);
+        } catch (UnknownCharsetException $e) {
+            // Chain the original exception so the failing charset lookup stays traceable
+            throw new \RuntimeException('Invalid config.metaCharset: ' . $e->getMessage(), 1508916285, $e);
+        }
+    }
 }
index 9ee148b..f955918 100644 (file)
@@ -14,14 +14,10 @@ namespace TYPO3\CMS\IndexedSearch;
  * The TYPO3 project - inspiring people to share!
  */
 
-use Psr\Http\Message\ServerRequestInterface;
 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
-use TYPO3\CMS\Core\Context\Context;
-use TYPO3\CMS\Core\Context\LanguageAspect;
 use TYPO3\CMS\Core\Core\Environment;
 use TYPO3\CMS\Core\Database\Connection;
 use TYPO3\CMS\Core\Database\ConnectionPool;
-use TYPO3\CMS\Core\Routing\PageArguments;
 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
 use TYPO3\CMS\Core\Utility\GeneralUtility;
 use TYPO3\CMS\Core\Utility\MathUtility;
@@ -100,13 +96,6 @@ class Indexer
     public $forceIndexing = false;
 
     /**
-     * If TRUE, indexing is forced despite of hashes etc.
-     *
-     * @var bool
-     */
-    public $crawlerActive = false;
-
-    /**
      * Set when crawler is detected (internal)
      *
      * @var array
@@ -235,206 +224,15 @@ class Indexer
     public function __construct()
     {
         $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
-    }
-
-    /**
-     * Parent Object (TSFE) Initialization
-     *
-     * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
-     */
-    public function hook_indexContent(&$pObj)
-    {
-        // Indexer configuration from Extension Manager interface:
-        $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');
-        // Crawler activation:
-        // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction:
-        if (\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler') && $pObj->applicationData['tx_crawler']['running'] && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {
-            // Setting simple log message:
-            $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
-            // Setting variables:
-            $this->crawlerActive = true;
-            // Crawler active flag
-            $this->forceIndexing = true;
-        }
-        // Determine if page should be indexed, and if so, configure and initialize indexer
-        if ($pObj->config['config']['index_enable']) {
-            $this->log_push('Index page', '');
-            if (!$disableFrontendIndexing || $this->crawlerActive) {
-                if (!$pObj->page['no_search']) {
-                    if (!$pObj->no_cache) {
-                        /** @var LanguageAspect $languageAspect */
-                        $languageAspect = GeneralUtility::makeInstance(Context::class)->getAspect('language');
-                        if ($languageAspect->getId() === $languageAspect->getContentId()) {
-                            // Setting up internal configuration from config array:
-                            $this->conf = [];
-                            // Information about page for which the indexing takes place
-                            $this->conf['id'] = $pObj->id;
-                            // Page id
-                            $this->conf['type'] = $pObj->type;
-                            // Page type
-                            $this->conf['sys_language_uid'] = $languageAspect->getId();
-                            // sys_language UID of the language of the indexing.
-                            $this->conf['MP'] = $pObj->MP;
-                            // MP variable, if any (Mount Points)
-                            // Group list
-                            $this->conf['gr_list'] = implode(',', GeneralUtility::makeInstance(Context::class)->getPropertyFromAspect('frontend.user', 'groupIds', [0, -1]));
-                            // page arguments array
-                            $this->conf['staticPageArguments'] = [];
-                            if ($GLOBALS['TYPO3_REQUEST'] instanceof ServerRequestInterface) {
-                                /** @var PageArguments $pageArguments */
-                                $pageArguments = $GLOBALS['TYPO3_REQUEST']->getAttribute('routing', null);
-                                if ($pageArguments instanceof PageArguments) {
-                                    $this->conf['staticPageArguments'] = $pageArguments->getStaticArguments();
-                                }
-                            }
-                            // Array of the additional parameters
-                            $this->conf['crdate'] = $pObj->page['crdate'];
-                            // The creation date of the TYPO3 page
-
-                            // Root line uids
-                            $this->conf['rootline_uids'] = [];
-                            foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
-                                $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
-                            }
-                            // Content of page:
-                            $this->conf['content'] = $pObj->content;
-                            // Content string (HTML of TYPO3 page)
-                            $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
-                            // Alternative title for indexing
-                            $this->conf['metaCharset'] = $pObj->metaCharset;
-                            // Character set of content (will be converted to utf-8 during indexing)
-                            $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'];
-                            // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
-                            // Configuration of behavior:
-                            $this->conf['index_externals'] = $pObj->config['config']['index_externals'];
-                            // Whether to index external documents like PDF, DOC etc. (if possible)
-                            $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];
-                            // Length of description text (max 250, default 200)
-                            $this->conf['index_metatags'] = $pObj->config['config']['index_metatags'] ?? true;
-                            // Set to zero:
-                            $this->conf['recordUid'] = 0;
-                            $this->conf['freeIndexUid'] = 0;
-                            $this->conf['freeIndexSetId'] = 0;
-                            // Init and start indexing:
-                            $this->init();
-                            $this->indexTypo3PageContent();
-                        } else {
-                            $this->log_setTSlogMessage('Index page? No, languageId was different from contentId which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
-                        }
-                    } else {
-                        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
-                    }
-                } else {
-                    $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
-                }
-            } else {
-                $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
-            }
-            $this->log_pull();
-        }
-    }
-
-    /****************************
-     *
-     * Backend API
-     *
-     ****************************/
-    /**
-     * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
-     *
-     * @param int $id The page uid, &id=
-     * @param int $type The page type, &type=
-     * @param int $sys_language_uid sys_language uid, typically &L=
-     * @param string $MP The MP variable (Mount Points), &MP=
-     * @param array $uidRL Rootline array of only UIDs.
-     * @param array $queryArguments Array of GET variables to register with this indexing
-     */
-    public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $queryArguments = [])
-    {
-        // Setting up internal configuration from config array:
-        $this->conf = [];
-        // Information about page for which the indexing takes place
-        $this->conf['id'] = $id;
-        // Page id     (int)
-        $this->conf['type'] = $type;
-        // Page type (int)
-        $this->conf['sys_language_uid'] = $sys_language_uid;
-        // sys_language UID of the language of the indexing (int)
-        $this->conf['MP'] = $MP;
-        // MP variable, if any (Mount Points) (string)
-        $this->conf['gr_list'] = '0,-1';
-        // Group list (hardcoded for now...)
-        $this->conf['staticPageArguments'] = $queryArguments;
-        // Set to defaults
-        $this->conf['freeIndexUid'] = 0;
-        $this->conf['freeIndexSetId'] = 0;
-
-        // Root line uids
-        $this->conf['rootline_uids'] = $uidRL;
-        // Configuration of behavior:
-        $this->conf['index_externals'] = 1;
-        // Whether to index external documents like PDF, DOC etc. (if possible)
-        $this->conf['index_descrLgd'] = 200;
-        // Length of description text (max 250, default 200)
-        $this->conf['index_metatags'] = true;
-        // Whether to index document keywords and description (if present)
-        // Init and start indexing:
-        $this->init();
-    }
-
-    /**
-     * Sets the free-index uid. Can be called right after backend_initIndexer()
-     *
-     * @param int $freeIndexUid Free index UID
-     * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
-     */
-    public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
-    {
-        $this->conf['freeIndexUid'] = $freeIndexUid;
-        $this->conf['freeIndexSetId'] = $freeIndexSetId;
-    }
-
-    /**
-     * Indexing records as the content of a TYPO3 page.
-     *
-     * @param string $title Title equivalent
-     * @param string $keywords Keywords equivalent
-     * @param string $description Description equivalent
-     * @param string $content The main content to index
-     * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
-     * @param int $mtime Last modification time, in seconds
-     * @param int $crdate The creation date of the content, in seconds
-     * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
-     */
-    public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
-    {
-        // Content of page:
-        $this->conf['mtime'] = $mtime;
-        // Most recent modification time (seconds) of the content
-        $this->conf['crdate'] = $crdate;
-        // The creation date of the TYPO3 content
-        $this->conf['recordUid'] = $recordUid;
-        // UID of the record, if applicable
-        // Construct fake HTML for parsing:
-        $this->conf['content'] = '
-               <html>
-                       <head>
-                               <title>' . htmlspecialchars($title) . '</title>
-                               <meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
-                               <meta name="description" content="' . htmlspecialchars($description) . '" />
-                       </head>
-                       <body>
-                               ' . htmlspecialchars($content) . '
-                       </body>
-               </html>';
-        // Content string (HTML of TYPO3 page)
-        // Initializing charset:
-        $this->conf['metaCharset'] = $charset;
-        // Character set of content (will be converted to utf-8 during indexing)
-        $this->conf['indexedDocTitle'] = '';
-        // Alternative title for indexing
-        // Index content as if it was a TYPO3 page:
-        $this->indexTypo3PageContent();
+        // Indexer configuration from Extension Manager interface
+        $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
+        $this->tstamp_minAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
+        $this->tstamp_maxAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
+        $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
+        $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);
+        // Workaround: If the extension configuration was not updated yet, the value is not existing
+        $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
+        $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
     }
 
     /********************************
@@ -442,22 +240,18 @@ class Indexer
      * Initialization
      *
      *******************************/
+
     /**
-     * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
+     * Initializes the object.
+     * @param array|null $configuration will be used to set $this->conf, otherwise $this->conf MUST be set with proper values prior to this call
      */
-    public function init()
+    public function init(array $configuration = null)
     {
+        if (is_array($configuration)) {
+            $this->conf = $configuration;
+        }
         // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
         $this->setT3Hashes();
-        // Indexer configuration from Extension Manager interface:
-        $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
-        $this->tstamp_minAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
-        $this->tstamp_maxAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
-        $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
-        $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);
-        // Workaround: If the extension configuration was not updated yet, the value is not existing
-        $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
-        $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
         // Initialize external document parsers:
         // Example configuration, see ext_localconf.php of this file!
         if ($this->conf['index_externals']) {
index 0bb7909..5a1f8eb 100644 (file)
@@ -37,7 +37,7 @@ class IndexerTest extends UnitTestCase
     public function extractHyperLinksDoesNotReturnNonExistingLocalPath()
     {
         $html = 'test <a href="' . StringUtility::getUniqueId() . '">test</a> test';
-        $subject = new Indexer();
+        $subject = $this->getMockBuilder(Indexer::class)->disableOriginalConstructor()->addMethods(['dummy'])->getMock();
         $result = $subject->extractHyperLinks($html);
         self::assertEquals(1, count($result));
         self::assertEquals('', $result[0]['localPath']);
@@ -50,7 +50,7 @@ class IndexerTest extends UnitTestCase
     {
         $baseURL = \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
         $html = 'test <a href="' . $baseURL . 'index.php">test</a> test';
-        $subject = new Indexer();
+        $subject = $this->getMockBuilder(Indexer::class)->disableOriginalConstructor()->addMethods(['dummy'])->getMock();
         $result = $subject->extractHyperLinks($html);
         self::assertEquals(1, count($result));
         self::assertEquals(Environment::getPublicPath() . '/index.php', $result[0]['localPath']);
@@ -62,7 +62,7 @@ class IndexerTest extends UnitTestCase
     public function extractHyperLinksFindsCorrectPathWithAbsolutePath()
     {
         $html = 'test <a href="index.php">test</a> test';
-        $subject = new Indexer();
+        $subject = $this->getMockBuilder(Indexer::class)->disableOriginalConstructor()->addMethods(['dummy'])->getMock();
         $result = $subject->extractHyperLinks($html);
         self::assertEquals(1, count($result));
         self::assertEquals(Environment::getPublicPath() . '/index.php', $result[0]['localPath']);
@@ -74,7 +74,7 @@ class IndexerTest extends UnitTestCase
     public function extractHyperLinksFindsCorrectPathForPathWithinTypo3Directory()
     {
         $html = 'test <a href="typo3/index.php">test</a> test';
-        $subject = new Indexer();
+        $subject = $this->getMockBuilder(Indexer::class)->disableOriginalConstructor()->addMethods(['dummy'])->getMock();
         $result = $subject->extractHyperLinks($html);
         self::assertEquals(1, count($result));
         self::assertEquals(Environment::getPublicPath() . '/typo3/index.php', $result[0]['localPath']);
@@ -94,7 +94,7 @@ class IndexerTest extends UnitTestCase
             ],
         ];
         $GLOBALS['TSFE']->config = $config;
-        $subject = new Indexer();
+        $subject = $this->getMockBuilder(Indexer::class)->disableOriginalConstructor()->addMethods(['dummy'])->getMock();
         $result = $subject->extractHyperLinks($html);
         self::assertEquals(1, count($result));
         self::assertEquals(Environment::getPublicPath() . '/index.php', $result[0]['localPath']);
@@ -107,7 +107,7 @@ class IndexerTest extends UnitTestCase
     {
         $baseHref = 'http://example.com/';
         $html = '<html><head><Base Href="' . $baseHref . '" /></head></html>';
-        $subject = new Indexer();
+        $subject = $this->getMockBuilder(Indexer::class)->disableOriginalConstructor()->addMethods(['dummy'])->getMock();
         $result = $subject->extractBaseHref($html);
         self::assertEquals($baseHref, $result);
     }
@@ -150,7 +150,7 @@ EOT;
 
 EOT;
 
-        $subject = new Indexer();
+        $subject = $this->getMockBuilder(Indexer::class)->disableOriginalConstructor()->addMethods(['dummy'])->getMock();
         $result = $subject->typoSearchTags($body);
         self::assertTrue($result);
         self::assertEquals($expected, $body);
@@ -206,7 +206,7 @@ EOT;
 
 EOT;
 
-        $subject = new Indexer();
+        $subject = $this->getMockBuilder(Indexer::class)->disableOriginalConstructor()->addMethods(['dummy'])->getMock();
         $result = $subject->typoSearchTags($body);
         self::assertTrue($result);
         self::assertEquals($expected, $body);
index dd16033..ccbd257 100644 (file)
@@ -10,7 +10,7 @@ defined('TYPO3_MODE') or die();
 );
 
 // Attach to hooks:
-$GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['tslib/class.tslib_fe.php']['pageIndexing'][] = \TYPO3\CMS\IndexedSearch\Indexer::class;
+$GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['tslib/class.tslib_fe.php']['pageIndexing'][] = \TYPO3\CMS\IndexedSearch\Hook\TypoScriptFrontendHook::class;
 $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['tslib/class.tslib_fe.php']['headerNoCache']['tx_indexedsearch'] = \TYPO3\CMS\IndexedSearch\Hook\TypoScriptFrontendHook::class . '->headerNoCache';
 // Register with "crawler" extension:
 $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['indexed_search'] = [