Commit f08d8a04 authored by Benni Mack's avatar Benni Mack Committed by Georg Ringer
Browse files

[!!!][TASK] Drop dependency to EXT:crawler

Indexed Search provides Crawler API hooks, not allowing
EXT:crawler or any other extension to properly queue
indexing processing.

In order to add a new external queuing / indexing system
the existing crawler integration from 2006 is removed.

A new feature is already in review, see https://review.typo3.org/c/Packages/TYPO3.CMS/+/64790

Resolves: #93110
Releases: master
Change-Id: Ib8823ec60830685e93fdc6957ec7f3f154002038
Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/67183

Tested-by: Tizian Schmidlin's avatarTizian Schmidlin <st@cabag.ch>
Tested-by: default avatarTYPO3com <noreply@typo3.com>
Tested-by: Christian Kuhn's avatarChristian Kuhn <lolli@schwarzbu.ch>
Tested-by: Tomas Norre Mikkelsen's avatarTomas Norre Mikkelsen <tomasnorre@gmail.com>
Tested-by: Georg Ringer's avatarGeorg Ringer <georg.ringer@gmail.com>
Reviewed-by: Tizian Schmidlin's avatarTizian Schmidlin <st@cabag.ch>
Reviewed-by: Christian Kuhn's avatarChristian Kuhn <lolli@schwarzbu.ch>
Reviewed-by: Tomas Norre Mikkelsen's avatarTomas Norre Mikkelsen <tomasnorre@gmail.com>
Reviewed-by: Georg Ringer's avatarGeorg Ringer <georg.ringer@gmail.com>
parent 50f62bae
.. include:: ../../Includes.txt
===============================================================================
Breaking: #93110 - Indexed search does not provide hook for EXT:crawler anymore
===============================================================================
See :issue:`93110`
Description
===========
Indexed search had an explicit dependency on an old API of
the third-party extension "crawler". This cross-dependency did
not allow either component to move forward.
In order to build a new solution, legacy code has been removed
without substitution for the time being, whereas new code
will be added during further TYPO3 v11 development.
Impact
======
TYPO3 v11 does not use existing EXT:crawler hooks and APIs anymore.
Affected Installations
======================
TYPO3 installations using EXT:crawler and EXT:indexed_search.
Migration
=========
None until a more flexible solution is provided. However,
this only affects the maintainers of EXT:crawler.
.. index:: PHP-API, NotScanned, ext:indexed_search
\ No newline at end of file
......@@ -26,6 +26,7 @@ use TYPO3\CMS\Core\Utility\PathUtility;
/**
* External standard parsers for indexed_search
* MUST RETURN utf-8 content!
* @internal will be removed, in favor of unified Content Extractor API.
*/
class FileContentParser
{
......
<?php
/*
* This file is part of the TYPO3 CMS project.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/
namespace TYPO3\CMS\IndexedSearch\Hook;
use TYPO3\CMS\Core\Utility\GeneralUtility;
use TYPO3\CMS\IndexedSearch\Indexer;
/**
* Crawler hook for indexed search. Works with the "crawler" extension
* This hook is specifically used to index external files found on pages through the crawler extension.
* @see \TYPO3\CMS\IndexedSearch\Indexer::extractLinks()
* @internal this is a TYPO3-internal hook implementation and not part of TYPO3's Core API.
*/
class CrawlerFilesHook
{
    /**
     * Callback executed by the crawler for a queued log element.
     *
     * Initialises an Indexer with the configuration stored in the queue entry
     * and indexes the referenced document. When an alternative URL is given,
     * that URL is what gets passed to the indexer as the file, while the
     * original document path and its lower-cased extension are passed along
     * as additional arguments.
     *
     * @param array $params Params from log element ('conf', 'document' and optionally 'alturl').
     * @param object $pObj Parent object (tx_crawler lib); not used by this method.
     * @return array|null Result array, or null when no usable configuration was supplied
     */
    public function crawler_execute($params, &$pObj)
    {
        // Without an indexer configuration there is nothing to do. The null
        // coalescing avoids an "undefined array key" warning on malformed entries.
        if (!is_array($params['conf'] ?? null)) {
            return null;
        }
        $indexerObj = GeneralUtility::makeInstance(Indexer::class);
        $indexerObj->init($params['conf']);
        // Index document:
        if (!empty($params['alturl'])) {
            // PATHINFO_EXTENSION yields an empty string for documents without an
            // extension, instead of a missing 'extension' key in the pathinfo array.
            $ext = strtolower(pathinfo((string)$params['document'], PATHINFO_EXTENSION));
            $indexerObj->indexRegularDocument($params['alturl'], true, $params['document'], $ext);
        } else {
            $indexerObj->indexRegularDocument($params['document'], true);
        }
        // Return OK:
        return ['content' => []];
    }
}
......@@ -31,24 +31,6 @@ use TYPO3\CMS\IndexedSearch\Indexer;
*/
class TypoScriptFrontendHook
{
/**
 * Frontend hook: forces regeneration of the page when a crawler run asks for re-indexing.
 *
 * Looks for the 'tx_indexedsearch_reindex' processing instruction in the crawler
 * parameters stored in the TSFE application data. If it is present, the cache
 * look-up is disabled through $params and the 'forceIndexing' flag is set on the
 * TSFE application data; otherwise nothing is changed.
 *
 * @param array $params Parameters from frontend
 * @param TypoScriptFrontendController $tsfe TSFE object
 */
public function headerNoCache(array &$params, TypoScriptFrontendController $tsfe)
{
    // Processing instructions handed over by the crawler; empty list when none are set.
    $procInstructions = $tsfe->applicationData['tx_crawler']['parameters']['procInstructions'] ?? [];
    if (!in_array('tx_indexedsearch_reindex', $procInstructions, true)) {
        return;
    }
    // Skip the cached page data so the page is rendered again even if cached.
    $params['disableAcquireCacheData'] = true;
    // Enable indexing
    $tsfe->applicationData['forceIndexing'] = true;
}
/**
* Trigger indexing of content, after evaluating if this page could / should be indexed.
*
......
......@@ -22,16 +22,16 @@ use TYPO3\CMS\Core\Database\ConnectionPool;
use TYPO3\CMS\Core\Html\HtmlParser;
use TYPO3\CMS\Core\Http\RequestFactory;
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
use TYPO3\CMS\Core\Utility\GeneralUtility;
use TYPO3\CMS\Core\Utility\MathUtility;
use TYPO3\CMS\Core\Utility\PathUtility;
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
use TYPO3\CMS\IndexedSearch\Hook\CrawlerFilesHook;
use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
/**
* Indexing class for TYPO3 frontend
*
* @internal
*/
class Indexer
{
......@@ -533,16 +533,8 @@ class Indexer
*/
public function extractLinks($content)
{
$crawler = null;
// Get links:
$list = $this->extractHyperLinks($content);
if ($this->indexerConfig['useCrawlerForExternalFiles'] && ExtensionManagementUtility::isLoaded('crawler')) {
/**
* todo: remove dependency to class tx_crawler_lib
* @link https://forge.typo3.org/issues/83603
*/
$crawler = GeneralUtility::makeInstance('tx_crawler_lib');
}
// Traverse links:
foreach ($list as $linkInfo) {
// Decode entities:
......@@ -577,30 +569,9 @@ class Indexer
if ($linkInfo['localPath']) {
$fI = pathinfo($linkSource);
$ext = strtolower($fI['extension']);
if (is_object($crawler)) {
$params = [
'document' => $linkSource,
'alturl' => $linkInfo['href'],
'conf' => $this->conf
];
unset($params['conf']['content']);
$crawler->addQueueEntry_callBack(0, $params, CrawlerFilesHook::class, $this->conf['id']);
$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
} else {
$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
}
$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
} else {
if (is_object($crawler)) {
$params = [
'document' => $linkSource,
'conf' => $this->conf
];
unset($params['conf']['content']);
$crawler->addQueueEntry_callBack(0, $params, CrawlerFilesHook::class, $this->conf['id']);
$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
} else {
$this->indexRegularDocument($linkSource);
}
$this->indexRegularDocument($linkSource);
}
}
}
......
......@@ -10,18 +10,8 @@ defined('TYPO3') or die();
[\TYPO3\CMS\IndexedSearch\Controller\SearchController::class => 'form,search']
);
// Attach to hooks:
$GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['tslib/class.tslib_fe.php']['contentPostProc-cached']['indexed_search'] = \TYPO3\CMS\IndexedSearch\Hook\TypoScriptFrontendHook::class . '->indexPageContent';
$GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['tslib/class.tslib_fe.php']['headerNoCache']['tx_indexedsearch'] = \TYPO3\CMS\IndexedSearch\Hook\TypoScriptFrontendHook::class . '->headerNoCache';
// Register with "crawler" extension:
$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['indexed_search'] = [
'key' => 'tx_indexedsearch_reindex',
'value' => 'Re-indexing'
];
$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks']['tx_indexedsearch_crawl'] = \TYPO3\CMS\IndexedSearch\Hook\CrawlerHook::class;
// Register with DataHandler:
$GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['processCmdmapClass']['tx_indexedsearch'] = \TYPO3\CMS\IndexedSearch\Hook\CrawlerHook::class;
$GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['processDatamapClass']['tx_indexedsearch'] = \TYPO3\CMS\IndexedSearch\Hook\CrawlerHook::class;
// Configure default document parsers:
$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] = [
'pdf' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment