[TASK] Doctrine: Migrate indexed_search part 1
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Database\ConnectionPool;
18 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
19 use TYPO3\CMS\Core\Utility\GeneralUtility;
20 use TYPO3\CMS\Core\Utility\MathUtility;
21 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
22 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
23
24 /**
25 * Indexing class for TYPO3 frontend
26 */
27 class Indexer
28 {
/**
 * Messages for the reason codes used when deciding whether a document
 * is (re-)indexed. indexTypo3PageContent() indexes on codes greater than 0
 * and logs the message of the code otherwise.
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * HTML code blocks to exclude from indexing (comma separated tag names)
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * Supported extensions for external files: external parser objects,
 * keys are file extension names. Values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Max time: If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Min time: If set, this tells a minimum limit before a document can be indexed again.
 * This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Empty template of the content parts array; splitHTMLContent() starts from a copy of it.
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = [];

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = [];

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content of TYPO3 page, split into parts (see splitHTMLContent())
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash calculated over the joined content parts of the page
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = [];

/**
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = [];

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Bit mask taken from the extension configuration; init() forces it into the integer range 0-255.
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
236
/**
 * Creates the indexer and obtains the TimeTracker instance via GeneralUtility::makeInstance().
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
244
/**
 * Frontend hook: decides whether the page rendered by the parent object (TSFE)
 * is to be indexed and, if so, configures the indexer from it and starts indexing.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 * @return void
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface
    $extensionConfiguration = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'], ['allowed_classes' => false]);

    // Crawler activation: the crawler must be loaded, a crawler session must be
    // running and re-indexing must be requested as processing instruction
    $crawlerRequestsReindexing = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && $pObj->applicationData['tx_crawler']['running']
        && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
    if ($crawlerRequestsReindexing) {
        // Simple log message for the crawler
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag, and force indexing regardless of hashes
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }

    // Nothing to do (and nothing to log) unless indexing is enabled for the page
    if (!$pObj->config['config']['index_enable']) {
        return;
    }

    $this->log_push('Index page', '');
    if ($extensionConfiguration['disableFrontendIndexing'] && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Root line uids
        $rootlineUids = [];
        foreach ($pObj->config['rootLine'] as $rootlineKey => $rootlinePage) {
            $rootlineUids[$rootlineKey] = $rootlinePage['uid'];
        }
        // Internal configuration, taken from the parent object
        $this->conf = [
            // Page id
            'id' => $pObj->id,
            // Page type
            'type' => $pObj->type,
            // sys_language UID of the language of the indexing
            'sys_language_uid' => $pObj->sys_language_uid,
            // MP variable, if any (Mount Points)
            'MP' => $pObj->MP,
            // Group list
            'gr_list' => $pObj->gr_list,
            // cHash string for additional parameters
            'cHash' => $pObj->cHash,
            // Array of the additional parameters
            'cHash_array' => $pObj->cHash_array,
            // The creation date of the TYPO3 page
            'crdate' => $pObj->page['crdate'],
            // reg1 of the caching table. Not known what practical use this has.
            'page_cache_reg1' => $pObj->page_cache_reg1,
            'rootline_uids' => $rootlineUids,
            // Content string (HTML of TYPO3 page)
            'content' => $pObj->content,
            // Alternative title for indexing
            'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
            // Character set of content (will be converted to utf-8 during indexing)
            'metaCharset' => $pObj->metaCharset,
            // Most recent modification time (seconds) of the content on the page.
            // Used to evaluate whether it should be re-indexed.
            'mtime' => isset($pObj->register['SYS_LASTCHANGED']) ? $pObj->register['SYS_LASTCHANGED'] : $pObj->page['SYS_LASTCHANGED'],
            // Whether to index external documents like PDF, DOC etc. (if possible)
            'index_externals' => $pObj->config['config']['index_externals'],
            // Length of description text (max 250, default 200)
            'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
            // Whether to index keywords and description meta tags (defaults to TRUE)
            'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true,
            // Set to zero
            'recordUid' => 0,
            'freeIndexUid' => 0,
            'freeIndexSetId' => 0
        ];
        // Init and start indexing
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
335
336 /****************************
337 *
338 * Backend API
339 *
340 ****************************/
/**
 * Sets up the internal configuration for the page (phash) being indexed from the backend
 * (or for which external media is attached) and initializes the indexer.
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 * @return void
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = false)
{
    // cHash string for additional parameters; only calculated on request
    $cHash = '';
    if ($createCHash) {
        /* @var $cacheHashCalculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHashCalculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHash = $cacheHashCalculator->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }

    // Internal configuration describing the page for which the indexing takes place
    $this->conf = [
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => '',
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true
    ];

    // Init; indexing itself is started by the caller
    $this->init();
}
395
/**
 * Stores the free-index uid and set id in the internal configuration.
 * Can be called right after backend_initIndexer()
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 * @return void
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId
    ];
    foreach ($freeIndexSettings as $confKey => $confValue) {
        $this->conf[$confKey] = $confValue;
    }
}
408
/**
 * Indexes record data as if it was the content of a TYPO3 page: the values are wrapped
 * into a fake HTML document which is then passed through the regular page indexing.
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Fake HTML for parsing; all values are HTML-escaped before they are embedded
    $fakeHtml = '
<html>
<head>
<title>' . htmlspecialchars($title) . '</title>
<meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
<meta name="description" content="' . htmlspecialchars($description) . '" />
</head>
<body>
' . htmlspecialchars($content) . '
</body>
</html>';

    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Content string (HTML of TYPO3 page)
    $this->conf['content'] = $fakeHtml;
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing (none)
    $this->conf['indexedDocTitle'] = '';

    // Index content as if it was a TYPO3 page
    $this->indexTypo3PageContent();
}
452
453 /********************************
454 *
455 * Initialization
456 *
457 *******************************/
/**
 * Initializes the object: cHash parameters, hashes, extension configuration values,
 * external parsers, lexer, metaphone hook and charset converter.
 * $this->conf MUST be set with proper values prior to this call!!!
 *
 * @return void
 */
public function init()
{
    // cHash parameters
    $cHashParameters = $this->conf['cHash_array'];
    if (is_array($cHashParameters) && !empty($cHashParameters)) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $cHashParameters['cHash'] = $this->conf['cHash'];
        }
        unset($cHashParameters['encryptionKey']);
    }
    $this->cHashParams = $cHashParameters;

    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables
    $this->setT3Hashes();

    // Indexer configuration from Extension Manager interface
    $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'], ['allowed_classes' => false]);
    $settings = $this->indexerConfig;
    // minAge / maxAge are multiplied by 3600 before being stored as seconds
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($settings['minAge'] * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($settings['maxAge'] * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($settings['maxExternalFiles'], 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($settings['flagBitMask'], 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($settings['enableMetaphoneSearch']) || $settings['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;

    // Initialize external document parsers
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }

    // Initialize lexer (class that deconstructs the text into words); a configured lexer wins over the default one
    $lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
    $this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];

    // Initialize metaphone hook
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
        $this->metaphoneObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
        $this->metaphoneObj->pObj = $this;
    }

    // Init charset class
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
503
/**
 * Creates the external parser objects registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'],
 * keyed by file extension. Parsers whose initParser() returns FALSE are dropped again.
 *
 * @return void
 * @access private
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
    if (!is_array($registeredParsers)) {
        return;
    }
    foreach ($registeredParsers as $extension => $_objRef) {
        $parser = GeneralUtility::getUserObj($_objRef);
        // Register first, so the parser is already listed while it initializes itself
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        // Init parser and if it returns FALSE, unset its entry again
        if (!$parser->initParser($extension)) {
            unset($this->external_parsers[$extension]);
        }
    }
}
524
525 /********************************
526 *
527 * Indexing; TYPO3 pages (HTML content)
528 *
529 *******************************/
/**
 * Start indexing of the TYPO3 page: checks whether indexing is needed at all,
 * splits the content, and either submits page and words or merely refreshes
 * timestamp, set id, gr_list and rootline when the content hash is unchanged.
 *
 * @return void
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    // Neither the timestamp check, a missing gr_list entry nor forced indexing asks for indexing
    if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

    // Divide into title, keywords, description and body
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Hash over what is to be the actual page content. Title, description and keywords are
    // included on purpose: otherwise a changed page title would make no difference.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // Checks if there is already a page (with gr_list = 0,-1) indexed which has the very same contentHash.
    // If so, the page is already indexed and regardless of mtime and origContent nothing more is needed.
    // This also prevents pages from being indexed again if a fe_user has logged in and the page content
    // turns out to be unchanged anyway - provided the page was indexed without a user login before.
    $checkCHash = $this->checkContentHash();

    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $Pstart = GeneralUtility::milliseconds();

    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyse the indexed words
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);

    // Checking external files if configured for
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
606
/**
 * Splits HTML content and returns an associative array, with title, keywords, description and the
 * tag-stripped text of the body.
 *
 * A document without a "<body" tag is treated as consisting of a head only, so title and meta tags
 * are still evaluated and the body part is empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ie. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    $contentArr = $this->defaultContentArray;

    // Divide head from body
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        // substr($content, 0, -0) would return an empty string and silently drop the head
        $headPart = $content;
        $contentArr['body'] = '';
    } else {
        $headPart = substr($content, 0, -strlen($bodyPart));
        $contentArr['body'] = $bodyPart;
    }

    // Get title; if it contains a colon only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $unusedAfter, $unusedParams);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

    // Get keywords and description metatags
    if ($this->conf['index_metatags']) {
        $remainingHead = $headPart;
        $metaParams = '';
        // Each call consumes the head up to and including the next meta tag
        while ($this->embracingTags($remainingHead, 'meta', $unusedContent, $remainingHead, $metaParams)) {
            $attributes = GeneralUtility::get_tag_attributes($metaParams);
            // Meta tags without a name (e.g. http-equiv or charset ones) are of no interest here
            $metaName = isset($attributes['name']) ? $attributes['name'] : '';
            $metaValue = isset($attributes['content']) ? $attributes['content'] : '';
            if (stristr($metaName, 'keywords')) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaValue);
            }
            if (stristr($metaName, 'description')) {
                $contentArr['description'] .= ',' . $metaValue;
            }
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $unusedContent, $contentArr['body'], $unusedParams)) {
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
658
/**
 * Extract the charset value from the HTML "Content-Type" http-equiv meta tag.
 *
 * @param string $content HTML content
 * @return string The charset value if found, otherwise an empty string.
 */
public function getHTMLcharset($content)
{
    $contentTypeTag = [];
    $charsetMatch = [];
    // First locate the <meta http-equiv="Content-Type" ...> tag, then the charset inside it
    if (
        preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $contentTypeTag)
        && preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $contentTypeTag[0], $charsetMatch)
    ) {
        return $charsetMatch[1];
    }
    // Explicit empty string instead of an implicit NULL, as promised by the return type
    return '';
}
673
/**
 * Converts a HTML document to utf-8, including its entities.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset: an explicitly given one wins over the one declared in the document
    $sourceCharset = $this->csObj->parse_charset($charset ?: $this->getHTMLcharset($content));
    // Convert charset, unless it is unknown or the content is utf-8 already
    $needsConversion = $sourceCharset && $sourceCharset !== 'utf-8';
    if ($needsConversion) {
        $content = $this->csObj->conv($content, $sourceCharset, 'utf-8');
    }
    // Convert entities, assuming document is now UTF-8
    return $this->csObj->entities_to_utf8($content);
}
693
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the start tag has no matching end tag, the tag content is empty and $stringAfter is set to
 * everything following the start tag. A start tag that is never closed by ">" is handled the same
 * way, with an empty $stringAfter.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $startTag = '<' . $tagName;
    $endTag = '</' . $tagName . '>';

    // Case-insensitive search for the start tag
    $startPosition = stripos((string)$string, $startTag);
    if ($startPosition === false) {
        return false;
    }

    // Split the rest into the attribute list and everything after the closing ">" of the start tag.
    // array_pad() covers a start tag without ">", where explode() returns a single element only.
    $fromTagName = substr($string, $startPosition + strlen($startTag));
    list($paramList, $afterStartTag) = array_pad(explode('>', $fromTagName, 2), 2, '');

    $endPosition = stripos($afterStartTag, $endTag);
    if ($endPosition !== false) {
        $tagContent = substr($afterStartTag, 0, $endPosition);
        // Text before the start tag plus text behind the end tag
        $stringAfter = substr($string, 0, $startPosition) . substr($afterStartTag, $endPosition + strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
728
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * Content following a "begin" marker is kept. Content before the first marker is only kept
 * if the first marker is an "end" marker. If no marker is present, $body is left untouched.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    // preg_split() returns FALSE on failure; a single chunk means that no marker was found
    if (!is_array($expBody) || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Holds the leading chunk until it is known whether an "end" marker follows it
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A truncated marker without "-->" carries no content
            $body .= isset($part[1]) ? $part[1] : '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
756
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme are handed to indexExternalUrl() if "indexExternalURLs" is configured.
 * Links without a query string that resolve to an existing local file are either added to the
 * crawler queue (if the crawler is used for external files) or indexed directly.
 *
 * @param string $content HTML content
 * @return void
 */
public function extractLinks($content)
{
    // Get links
    $list = $this->extractHyperLinks($content);

    // Stays NULL unless the crawler is configured for external files and loaded
    $crawler = null;
    if ($this->indexerConfig['useCrawlerForExternalFiles'] && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $this->includeCrawlerClass();
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }

    // Traverse links
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

        // Parse URL; parse_url() returns FALSE for seriously malformed URLs, hence the empty() checks
        $qParts = parse_url($linkSource);
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            $linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
            $qParts = parse_url($linkSource);
        }

        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if ($this->indexerConfig['indexExternalURLs']) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Local links carrying a query string are not treated as files
        if (!empty($qParts['query'])) {
            continue;
        }

        $linkSource = urldecode($linkSource);
        $localFile = GeneralUtility::isAllowedAbsPath($linkSource)
            ? $linkSource
            : GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }

        if (is_object($crawler)) {
            // Queue the file for the crawler; "alturl" is only set for download-script links
            $params = ['document' => $linkSource];
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            // The page content is not passed along with the queue entry
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            // Index local file under the URL of the download script
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            // Index local file
            $this->indexRegularDocument($linkSource);
        }
    }
}
834
/**
 * Collects the <a> tags of the HTML content string which have a href that
 * does not start with "#".
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $hyperLinksData = [];
    foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
        // Only the odd elements are evaluated as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        // Skip tags without href and links to anchors
        if (!$href || $href[0] == '#') {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        ];
    }
    return $hyperLinksData;
}
864
/**
 * Looks up the href of the first <base> tag that carries a non-empty href.
 *
 * @param string $html Content to analyze
 * @return string The base href; an empty string if the content has no <base> tag. If <base> tags exist but none has a non-empty href, the (empty) href value of the last one examined is returned.
 */
public function extractBaseHref($html)
{
    $parser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $baseHref = '';
    foreach ($parser->splitTags('base', $html) as $position => $fragment) {
        // Only the odd positions of the split result are inspected
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $parser->get_tag_attributes($fragment, true);
        $tagName = $parser->getFirstTagName($fragment);
        if (strtolower($tagName) !== 'base') {
            continue;
        }
        $baseHref = $attributes[0]['href'];
        if ($baseHref) {
            // First usable value wins
            break;
        }
    }
    return $baseHref;
}
890
891 /******************************************
892 *
893 * Indexing; external URL
894 *
895 ******************************************/
/**
 * Fetches an external URL and indexes it if it is served as HTML.
 *
 * The response headers are requested first. Only if the Content-Type header
 * contains "text/html" the body is downloaded, written to a temporary file
 * and handed to indexRegularDocument(). The downloaded body is also stored
 * in $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    $contentType = '';
    if (is_array($urlHeaders)) {
        if (isset($urlHeaders['Content-Type'])) {
            $contentType = (string)$urlHeaders['Content-Type'];
        } else {
            // HTTP header names are case-insensitive, so accept any spelling of the name
            foreach ($urlHeaders as $headerName => $headerValue) {
                if (strtolower(trim($headerName)) === 'content-type') {
                    $contentType = (string)$headerValue;
                    break;
                }
            }
        }
    }
    if (!stristr($contentType, 'text/html')) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime of the temporary file says nothing about the URL)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing throws
        unlink($tmpFile);
    }
}
926
/**
 * Getting HTTP response headers of URL
 *
 * Every header line is split at its first colon into name and value; the
 * value is stored without surrounding whitespace. Lines that contain no
 * colon (for instance a status line) are kept as keys with a NULL value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP header names are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return false;
    }
    // Compile headers:
    $retVal = array();
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        $lineParts = explode(':', $line, 2);
        $retVal[$lineParts[0]] = isset($lineParts[1]) ? trim($lineParts[1]) : null;
    }
    return $retVal;
}
951
/**
 * Tries to translate a link target into a local file path.
 *
 * A fixed list of resolver methods is asked one after the other; the first
 * non-empty answer is returned.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    // Resolver methods in the order they are consulted
    static $resolverMethods = array(
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    );
    $resolvedPath = '';
    foreach ($resolverMethods as $resolverMethod) {
        $resolvedPath = $this->{$resolverMethod}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
976
/**
 * Attempts to create a local file path from T3VARs. This is useful for
 * various download extensions that hide actual file name but still want the
 * file to be indexed.
 *
 * The lookup table is read from
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] and is keyed
 * by GeneralUtility::shortMD5() of the source path.
 *
 * @param string $sourcePath
 * @return string Registered file path, or an empty string if nothing (existing) is registered
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    if (isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey])) {
        return $registeredFiles[$lookupKey];
    }
    return '';
}
1000
/**
 * Attempts to create a local file path by matching the site URL of the
 * current request (TYPO3_SITE_URL) against the start of the source path.
 *
 * @param string $sourcePath
 * @return string Local path below PATH_site, or an empty string if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    // Replace the site URL prefix with the file system root of the site
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1021
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, or absRefPrefix is empty, this
 * function does nothing.
 *
 * @param string $sourcePath
 * @return string Local path below PATH_site, or an empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!($GLOBALS['TSFE'] instanceof TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    if (!($prefixLength > 0) || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    // Replace the prefix with the file system root of the site
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1045
/**
 * Attempts to create a local file path from an absolute URL without
 * schema, i.e. a path starting with "/".
 *
 * @param string $sourcePath
 * @return string Local path below PATH_site, or an empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    if ($sourcePath[0] != '/') {
        return '';
    }
    // Drop the leading slash and anchor the rest below the site root
    $candidate = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1065
/**
 * Attempts to create a local file path from a relative URL by
 * prepending PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path below PATH_site, or an empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1083
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not
 * start with "/".
 *
 * URLs that parse_url() cannot parse are reported as relative; callers
 * verify the resulting file path separately.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = parse_url((string)$url);
    if (!is_array($urlParts)) {
        return true;
    }
    if (isset($urlParts['scheme']) && $urlParts['scheme'] !== '') {
        return false;
    }
    // A missing or empty path (e.g. "?id=1") counts as relative
    $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
    return strpos($path, '/') !== 0;
}
1095
/**
 * Checks if the path points to an existing file inside the web site.
 *
 * The path is first passed through GeneralUtility::resolveBackPath(); the
 * result must start with PATH_site and must be a regular file.
 *
 * @param string $filePath
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    $existsAsFile = is_file($resolvedPath);
    $leadingPart = substr($resolvedPath, 0, strlen(PATH_site));
    return $leadingPart == PATH_site && $existsAsFile;
}
1109
1110 /******************************************
1111 *
1112 * Indexing; external files (PDF, DOC, etc)
1113 *
1114 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is handled section by section (see fileContentParts()). A
 * section is (re-)indexed when checkMtimeTstamp() asks for it or $force is
 * set, the external file counter still allows it, and the content hash has
 * changed. A section record is written for every section, indexed or not.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it passes \TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    $ext = $altExtension ?: strtolower(isset($fI['extension']) ? $fI['extension'] : '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = array();
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = array('key' => $cPKey);
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the reversed argument order is no longer accepted by PHP 8
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The real creation time of a file is not available, so stat()'s ctime is passed on instead
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1221
/**
 * Reads the content of an external file being indexed by delegating to the
 * external parser object registered for the file extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array|NULL Standard content array (title, description, keywords, body keys) as delivered by the parser; NULL if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return null;
    }
    // Consult relevant external document parser:
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1240
/**
 * Creates an array with pointers to divisions of document.
 *
 * The external parser object registered for the extension decides about the
 * divisions; without such an object the document is one single part (0).
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile)
{
    $parser = $this->external_parsers[$ext];
    if (!is_object($parser)) {
        return array(0);
    }
    // Consult relevant external document parser:
    return $parser->fileContentParts($ext, $absFile);
}
1257
/**
 * Wraps non-HTML content (from external files for instance) into a
 * standard content array.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Copy of $this->defaultContentArray with the key "body" set to $content
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    $result = $this->defaultContentArray;
    // Everything goes into the body; the remaining keys keep their defaults
    $result['body'] = $content;
    return $result;
}
1271
1272 /**********************************
1273 *
1274 * Analysing content, Extracting words
1275 *
1276 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 and decodes
 * the numeric / HTML entities it contains.
 *
 * @param array $contentArr Standard content array, modified in place
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach ($contentArr as $key => $value) {
        // Empty entries are left untouched
        if ((string)$value === '') {
            continue;
        }
        if ($needsConversion) {
            $value = $this->csObj->conv($value, $charset, 'utf-8');
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$key] = $this->csObj->entities_to_utf8($value);
    }
}
1297
/**
 * Processing words in the array from split*Content -functions
 *
 * Every entry is run through the lexer's split2Words(); afterwards
 * duplicates are removed from "title", "keywords" and "description"
 * (but not from "body").
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach ($contentArr as $section => $text) {
        $contentArr[$section] = $this->lexerObj->split2Words($text);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (array('title', 'keywords', 'description') as $section) {
        $contentArr[$section] = array_unique($contentArr[$section]);
    }
    return $contentArr;
}
1317
/**
 * Extracts the sample description text from the content array.
 *
 * The body has its whitespace runs collapsed to single blanks and is then
 * truncated to the length taken from $this->conf['index_descrLgd']
 * (passed through MathUtility::forceIntegerInRange() with the arguments
 * 0, 255 and 200).
 *
 * @param array $contentArr Content array
 * @return string Description string; empty if the resulting length is 0
 */
public function bodyDescription($contentArr)
{
    // Always defined, so a length of 0 yields '' instead of an undefined variable (NULL)
    $bodyDescription = '';
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }
    return $bodyDescription;
}
1335
/**
 * Analyzes content to use for indexing.
 *
 * Title, keywords and description are processed by analyzeHeaderinfo() with
 * the bit offsets 7, 6 and 5, the body by analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array keyed by word; the entries are filled by analyzeHeaderinfo() and analyzeBody() ("hash", "metaphone", "cmp", "count", "first")
 */
public function indexAnalyze($content)
{
    $wordIndex = array();
    // Header section => bit offset used for the "cmp" flag
    $headerSections = array(
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    );
    foreach ($headerSections as $section => $bitOffset) {
        $this->analyzeHeaderinfo($wordIndex, $content, $section, $bitOffset);
    }
    $this->analyzeBody($wordIndex, $content);
    return $wordIndex;
}
1352
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the index entry gets its "hash" and
 * "metaphone" (on first sight of the word), the bit 2^$offset set in "cmp"
 * and its "count" increased. $this->wordcount is increased per word and,
 * if enabled, the metaphone value is appended to $this->metaphoneContent.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 * @return void
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    // Nothing to analyze if the section is missing
    if (!isset($content[$key]) || !is_array($content[$key])) {
        return;
    }
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration)
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
        // Increase number of occurrences
        $retArr[$val]['count'] = isset($retArr[$val]['count']) ? $retArr[$val]['count'] + 1 : 1;
        $this->wordcount++;
    }
}
1385
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the index entry gets "first" (array
 * key of the first occurrence in the body), "hash" and "metaphone" when the
 * word is seen for the first time, and its "count" increased.
 * $this->wordcount is increased per word and, if enabled, the metaphone
 * value is appended to $this->metaphoneContent.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 */
public function analyzeBody(&$retArr, $content)
{
    // Nothing to analyze if there is no body
    if (!isset($content['body']) || !is_array($content['body'])) {
        return;
    }
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences
        $retArr[$val]['count'] = isset($retArr[$val]['count']) ? $retArr[$val]['count'] + 1 : 1;
        $this->wordcount++;
    }
}
1416
/**
 * Creating metaphone based hash from input word
 *
 * Uses $this->metaphoneObj if it is an object, otherwise PHP's native
 * metaphone() function.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Fall back to the native PHP function if no metaphone object is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        return 0;
    }
    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
1442
1443 /********************************
1444 *
1445 * SQL; TYPO3 Pages
1446 *
1447 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Existing records for the page's phash are removed first. Then the
 * index_phash row, the section and gr_list records, the index_fulltext row
 * and - in debug mode - the index_debug row are written. Each table is only
 * written if IndexedSearchUtility::isTableUsed() allows it.
 *
 * @return void
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // PROCESSING index_phash
    $phashRow = array(
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // Item type 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    );
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $phashRow);
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fulltextRow = array(
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    );
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // Cap the stored full text at the configured length
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRow);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // Page content and body are shortened to 1000 bytes for the debug record
    $debugInfo = array(
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
        'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRow = array(
        'phash' => $this->hash['phash'],
        'debuginfo' => serialize($debugInfo)
    );
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRow);
    }
}
1528
/**
 * Stores the current gr_list ($this->conf['gr_list']) and its integer hash
 * in the index_grlist table, if that table is in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    // Setting the gr_list record
    $grListRow = array(
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list']),
        'gr_list' => $this->conf['gr_list']
    );
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $grListRow);
}
1552
/**
 * Stores a section record in the index_section table, if that table is in use.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 * @return void
 */
public function submit_section($hash, $hash_t3)
{
    $sectionRow = array(
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    );
    // NOTE(review): presumably adds the rootline related fields to the row by reference - confirm in getRootLineFields()
    $this->getRootLineFields($sectionRow);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $sectionRow);
}
1575
/**
 * Removes records for the indexed page, $phash
 *
 * Deletes the rows with the given phash from index_phash, index_section,
 * index_grlist, index_fulltext and index_debug, and additionally all
 * index_section rows whose phash_t3 equals the given phash. Tables that are
 * not in use are skipped.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedPages($phash)
{
    // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
    $pageTables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($pageTables as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $deleteQuery = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($tableName);
        $deleteQuery
            ->delete($tableName)
            ->where(
                $deleteQuery->expr()->eq('phash', (int)$phash)
            )
            ->execute();
    }
    // Removing all index_section records with phash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $sectionQuery = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $sectionQuery
        ->delete('index_section')
        ->where(
            $sectionQuery->expr()->eq('phash_t3', (int)$phash)
        )
        ->execute();
}
1610
1611 /********************************
1612 *
1613 * SQL; External media
1614 *
1615 *******************************/
/**
 * Updates db with information about the file
 *
 * Existing records for the file's phash are removed first. Then the
 * index_phash row, the index_fulltext row and - in debug mode - the
 * index_debug row are written. Each table is only written if
 * IndexedSearchUtility::isTableUsed() allows it.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: the parser's mapping wins, the plain extension is the fallback
    $mappedItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    $itemType = $mappedItemType ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename; a scheme part marks the file as external URL
    $fileUrlParts = parse_url($file);
    // PROCESSING index_phash
    $phashRow = array(
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $itemType,
        // Fall back to the file name if the document has no title
        'item_title' => trim($contentParts['title']) ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $fileUrlParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    );
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $phashRow);
    }
    // PROCESSING index_fulltext
    $fulltextRow = array(
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    );
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // Cap the stored full text at the configured length
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRow);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // The body is shortened to 1000 bytes for the debug record
    $debugInfo = array(
        'cHashParams' => $subinfo,
        'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRow = array(
        'phash' => $hash['phash'],
        'debuginfo' => serialize($debugInfo)
    );
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRow);
    }
}
1698
/**
 * Registers a gr_list entry for an indexed file unless a matching one exists.
 *
 * Counts the index_grlist rows of the given phash whose hash_gr_list equals the
 * hash of either $this->defaultGrList or $this->conf['gr_list']. Only when no
 * such row is found, submit_grlist() is called with the phash used for both
 * arguments. Nothing happens if the index_grlist table is not in use.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $whereClause = 'phash=' . (int)$hash
        . ' AND (hash_gr_list=' . IndexedSearchUtility::md5inthash($this->defaultGrList)
        . ' OR hash_gr_list=' . IndexedSearchUtility::md5inthash($this->conf['gr_list']) . ')';
    $existingEntries = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $whereClause);
    if ($existingEntries == 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1715
/**
 * Registers a section entry for an indexed file unless one already exists.
 *
 * Looks for index_section rows matching the given phash and the current page
 * id ($this->conf['id']). When none is found, submit_section() is called with
 * the file phash and the phash stored in $this->hash['phash'].
 * Nothing happens if the index_section table is not in use.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $whereClause = 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id'];
    $existingSections = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $whereClause);
    if ($existingSections == 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1732
/**
 * Deletes all rows belonging to the given phash from the index tables
 * index_phash, index_grlist, index_fulltext and index_debug.
 *
 * Tables reported as unused by IndexedSearchUtility::isTableUsed() are skipped.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedFiles($phash)
{
    $indexTables = ['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'];
    foreach ($indexTables as $indexTable) {
        if (!IndexedSearchUtility::isTableUsed($indexTable)) {
            continue;
        }
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($indexTable);
        $queryBuilder->delete($indexTable);
        $queryBuilder->where($queryBuilder->expr()->eq('phash', (int)$phash));
        $queryBuilder->execute();
    }
}
1756
1757 /********************************
1758 *
1759 * SQL Helper functions
1760 *
1761 *******************************/
/**
 * Decides, based on the stored index_phash row of $phash, whether the
 * page/file has to be (re-)indexed.
 *
 * Result codes (see $this->reasons):
 *  4: no index_phash row found (or the table is not in use), so index
 *  1: $this->tstamp_maxAge is set and exceeded, so index
 * -2: $this->tstamp_minAge is set and not yet exceeded, so do not index
 *  3: min age passed (or not set) and no $mtime given, so index
 *  2: min age passed (or not set) and $mtime differs from the stored item_mtime, so index
 * -1: $mtime equals the stored item_mtime, so do not index. In this case the
 *     row's tstamp is refreshed via updateTstamp() unless a max age is configured.
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Negative values mean "no indexing", positive values mean "do indexing"
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }
    $indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
    if (!$indexedRow) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }
    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    $minAgePassed = !$this->tstamp_minAge || $indexedRow['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'];
    if (!$minAgePassed) {
        // The minimum age was not exceeded
        return -2;
    }
    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }
    if ($indexedRow['item_mtime'] != $mtime) {
        // The minimum age was exceeded and mtime differs from the indexed mtime, so the page is indexed.
        return 2;
    }
    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $secondsLeft = $indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME'];
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . $secondsLeft . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1818
/**
 * Looks up an index_phash row that has the same phash_grouping
 * ($this->hash['phash_grouping']) and the same content hash
 * ($this->content_md5h) as the current page.
 *
 * The page only needs indexing if its content differs from every row
 * of the same "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no matching row, or index_phash is not in use), otherwise the matching row as array with key "phash", to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $whereClause = 'phash_grouping=' . (int)$this->hash['phash_grouping'] . ' AND contentHash=' . (int)$this->content_md5h;
    $matchingRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $whereClause);
    return $matchingRow ?: true;
}
1836
/**
 * Tells whether an external document still needs indexing by counting the
 * index_phash rows that share the given phash_grouping and content hash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if no such row exists (or index_phash is not in use), i.e. the document needs to be indexed
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $whereClause = 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h;
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $whereClause);
    return $matches == 0;
}
1854
/**
 * Tells whether index_grlist holds at least one row for the given phash_x
 * (the "real" phash of the current content, not the linked-to phash of the
 * common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if a row exists; FALSE if not or if index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }
    $entries = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
    return $entries > 0;
}
1870
/**
 * Makes sure a grlist entry exists for the phash and the current gr_list
 * ($this->conf['gr_list']); a missing entry is written via submit_grlist()
 * and the insertion is logged.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; this is the whole reason for the grlist table.
 * @return void
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $whereClause = 'phash=' . (int)$phash . ' AND hash_gr_list=' . IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $existingEntries = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $whereClause);
    if ($existingEntries != 0) {
        return;
    }
    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1889
/**
 * Sets the tstamp of the index_phash row of $phash to the current
 * execution time ($GLOBALS['EXEC_TIME']), optionally together with item_mtime.
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-empty), item_mtime is updated to this value as well.
 * @return void
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $values = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime) {
        $values['item_mtime'] = (int)$mtime;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $values);
}
1909
/**
 * Writes the configured set id ($this->conf['freeIndexSetId']) into the
 * freeIndexSetId field of the index_phash row of $phash.
 *
 * @param int $phash phash value
 * @return void
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $values = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $values);
}
1925
/**
 * Stores the given parse time in the parsetime field of the index_phash
 * row of $phash.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set (cast to integer).
 * @return void
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $values = ['parsetime' => (int)$parsetime];
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $values);
}
1942
/**
 * Rewrites the rootline fields of all index_section rows that belong to
 * the current page ($this->conf['id']). The field values are collected
 * by getRootLineFields().
 *
 * @return void
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $rootLineValues = [];
    $this->getRootLineFields($rootLineValues);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int)$this->conf['id'], $rootLineValues);
}
1956
/**
 * Fills the root-line fields into the given field array.
 *
 * rl0, rl1 and rl2 are always set from $this->conf['rootline_uids'][0..2].
 * If $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * is an array (field name => rootline level), each of its entries is added
 * afterwards in the same way. All values are cast to integer.
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 */
public function getRootLineFields(array &$fieldArray)
{
    foreach ([0, 1, 2] as $standardLevel) {
        $fieldArray['rl' . $standardLevel] = (int)$this->conf['rootline_uids'][$standardLevel];
    }
    if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
        return;
    }
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $additionalField => $level) {
        $fieldArray[$additionalField] = (int)$this->conf['rootline_uids'][$level];
    }
}
1975
/**
 * Loads the file "class.tx_crawler_lib.php" from the extension path of
 * the "crawler" extension (once per request, via require_once).
 *
 * @return void
 */
public function includeCrawlerClass()
{
    require_once \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'class.tx_crawler_lib.php';
}
1985
1986 /********************************
1987 *
1988 * SQL; Submitting words
1989 *
1990 *******************************/
/**
 * Adds words that are not yet known to the index_words table.
 *
 * The number of index_words rows whose wid matches one of the word hashes is
 * compared with the size of the word list. Only if they differ, the stored
 * basewords are fetched, removed from the list, and every remaining word is
 * inserted with its hash (wid), the word itself (baseword) and its metaphone.
 *
 * Nothing happens if index_words is not in use or the word list is empty.
 *
 * @param array $wordListArray Word List array, keyed by word; each entry holds at least "hash" and "metaphone"
 * @return void
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }
    $wordIds = [];
    foreach ($wordListArray as $wordData) {
        $wordIds[] = (int)$wordData['hash'];
    }
    $widList = implode(',', $wordIds);
    $knownCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', 'wid IN (' . $widList . ')');
    $wordListArrayCount = count($wordListArray);
    if ($knownCount === $wordListArrayCount) {
        // Every word is already stored
        return;
    }
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN (' . $widList . ')');
    $this->log_setTSlogMessage('Inserting words: ' . ($wordListArrayCount - $knownCount), 1);
    // Drop all words from the list that are stored already
    while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        unset($wordListArray[$row['baseword']]);
    }
    $GLOBALS['TYPO3_DB']->sql_free_result($res);
    if (empty($wordListArray)) {
        return;
    }
    // The connection does not change between iterations, so it is resolved once up front
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_words');
    foreach ($wordListArray as $word => $wordData) {
        // A duplicate-key error will occur here if a stored word was NOT removed by the unset() above.
        // However as long as the words in the list are NOT longer than 60 chars (the baseword varchar
        // is 60 characters...) this is not a problem.
        $connection->insert('index_words', [
            'wid' => $wordData['hash'],
            'baseword' => $word,
            'metaphone' => $wordData['metaphone']
        ]);
    }
}
2030
/**
 * Submits RELATIONS between words and phash
 *
 * All existing index_rel rows of the phash are deleted first. Afterwards one
 * row per word is inserted (phash, wid, count, first, freq, flags), leaving
 * out every word whose hash is found among the index_words rows flagged as
 * stop word (is_stopword != 0).
 *
 * Nothing happens if the index_rel table is not in use.
 *
 * @param array $wordList Word list array; each entry holds "hash", "count", "first" and "cmp"
 * @param int $phash phash value
 * @return void
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    // The last argument asks for the result to be indexed by "wid", which the isset() lookup below relies on
    $stopWords = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('wid', 'index_words', 'is_stopword != 0', '', '', '', 'wid');

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_rel');
    $queryBuilder
        ->delete('index_rel')
        ->where(
            $queryBuilder->expr()->eq('phash', (int)$phash)
        )
        ->execute();

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        $rows[] = [
            (int)$phash,
            (int)$val['hash'],
            (int)$val['count'],
            (int)$val['first'],
            $this->freqMap($val['count'] / $this->wordcount),
            $val['cmp'] & $this->flagBitMask
        ];
    }
    // An empty word list, or one consisting of stop words only, leaves nothing to insert.
    // A bulk INSERT without any value rows cannot form a valid statement, so it is skipped.
    if (empty($rows)) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_rel')
        ->bulkInsert('index_rel', $rows, $fields);
}
2071
/**
 * Maps a frequency between the real-number scale and the stored scale.
 *
 * The map factor is $this->freqMax * 100 * $this->freqRange.
 * - Values <= 1 are multiplied by the factor and capped at $this->freqRange.
 * - Values > 1 are divided by the factor (the way back).
 *
 * @param float $freq Frequency
 * @return int|float Mapped frequency; the value is not cast, so it may be a float
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;
    if ($freq > 1) {
        return $freq / $mapFactor;
    }
    $scaled = $freq * $mapFactor;
    if ($scaled > $this->freqRange) {
        return $this->freqRange;
    }
    return $scaled;
}
2090
2091 /********************************
2092 *
2093 * Hashing
2094 *
2095 *******************************/
/**
 * Calculates the search hashes for TYPO3 pages and stores them in $this->hash.
 *
 * - "phash_grouping" is built from id, type, language, mount point and
 *   cHash parameters and thus identifies a "page".
 * - "phash" additionally takes the gr_list into account (subdivision for
 *   page compositions depending on the login; such pages are expected to be
 *   similar regardless of the login in most cases).
 *
 * @return void
 */
public function setT3Hashes()
{
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));

    // Same data plus the group list, appended as last key
    $pageIdentityWithGroups = $pageIdentity;
    $pageIdentityWithGroups['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageIdentityWithGroups));
}
2117
/**
 * Calculates the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the
 * file name plus $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = array())
{
    $fileIdentity = ['file' => $file];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));

    $fileIdentity['subinfo'] = $subinfo;
    $documentHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));

    return [
        'phash_grouping' => $groupingHash,
        'phash' => $documentHash
    ];
}
2139
2140 /*********************************
2141 *
2142 * Internal logging functions
2143 *
2144 *********************************/
/**
 * Forwards a push() call to the time tracker ($this->timeTracker).
 *
 * @param string $msg Title to set
 * @param string $key Key, handed over to the time tracker unchanged
 * @return void
 */
public function log_push($msg, $key)
{
    $timeTracker = $this->timeTracker;
    $timeTracker->push($msg, $key);
}
2156
/**
 * Forwards a pull() call to the time tracker ($this->timeTracker).
 *
 * @return void
 */
public function log_pull()
{
    $timeTracker = $this->timeTracker;
    $timeTracker->pull();
}
2166
/**
 * Hands a log message over to the time tracker and additionally
 * appends it to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number, passed on to the time tracker
 * @return void
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $timeTracker = $this->timeTracker;
    $timeTracker->setTSlogMessage($msg, $errorNum);
    $this->internal_log[] = $msg;
}
2179
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as a part of the fulltext index.
 *
 * The comma-separated list is split with GeneralUtility::trimExplode(),
 * re-joined with ", " and wrapped in one leading and one trailing space.
 *
 * @param string $keywordList Comma-separated list of keywords
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    $spacedKeywords = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
    return ' ' . $spacedKeywords . ' ';
}
2193 }