[!!!][TASK] Make TimeTracker a singleton
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
18 use TYPO3\CMS\Core\Utility\GeneralUtility;
19 use TYPO3\CMS\Core\Utility\MathUtility;
20 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
21 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
22
23 /**
24 * Indexing class for TYPO3 frontend
25 */
26 class Indexer
27 {
/**
 * Human-readable explanations for the status code returned by checkMtimeTstamp();
 * looked up as $this->reasons[$check] when writing log messages.
 *
 * @var array
 */
public $reasons = array(
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * HTML code blocks to exclude from indexing (comma separated list of tag names)
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * Supported extensions for external files: external parser objects, keys are
 * file extension names. Values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = array();

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Max time: If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Min time: If set, this tells a minimum limit (in seconds) before a document can be indexed again.
 * This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Skeleton of the content array; splitHTMLContent() starts from a copy of it.
 *
 * @var array
 */
public $defaultContentArray = array(
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
);

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = array();

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 */
public $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = array();

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = array();

/**
 * Content of TYPO3 page, split into parts (see splitHTMLContent())
 *
 * @var array
 */
public $contentParts = array();

/**
 * Hash over the joined content parts, set by indexTypo3PageContent()
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = array();

/**
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = array();

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Bit mask taken from the extension configuration; init() forces it into the integer range 0-255.
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
235
/**
 * Creates the indexer and stores the TimeTracker instance obtained through
 * GeneralUtility::makeInstance() in $this->timeTracker.
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
243
/**
 * Parent Object (TSFE) Initialization
 *
 * Frontend hook: checks whether the page that was just rendered may be indexed and, if so,
 * copies the relevant page information from the parent object into $this->conf, initializes
 * the indexer and indexes the page content. Every reason for not indexing is logged.
 *
 * A running crawler session that requests the "tx_indexedsearch_reindex" processing
 * instruction switches on forced re-indexing.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 * @return void
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
    // The crawler data is read defensively: it may be missing entirely or lack the list of processing instructions.
    $crawlerData = isset($pObj->applicationData['tx_crawler']) && is_array($pObj->applicationData['tx_crawler'])
        ? $pObj->applicationData['tx_crawler']
        : array();
    $procInstructions = isset($crawlerData['parameters']['procInstructions'])
        ? $crawlerData['parameters']['procInstructions']
        : null;
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && is_array($procInstructions)
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }
    // Nothing to do (and nothing to log) unless indexing is enabled for the page
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }
    // Determine if page should be indexed, and if so, configure and initialize indexer
    $this->log_push('Index page', '');
    if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Setting up internal configuration from config array:
        $this->conf = array();
        // Information about page for which the indexing takes place
        // Page id
        $this->conf['id'] = $pObj->id;
        // Page type
        $this->conf['type'] = $pObj->type;
        // sys_language UID of the language of the indexing.
        $this->conf['sys_language_uid'] = $pObj->sys_language_uid;
        // MP variable, if any (Mount Points)
        $this->conf['MP'] = $pObj->MP;
        // Group list
        $this->conf['gr_list'] = $pObj->gr_list;
        // cHash string for additional parameters
        $this->conf['cHash'] = $pObj->cHash;
        // Array of the additional parameters
        $this->conf['cHash_array'] = $pObj->cHash_array;
        // The creation date of the TYPO3 page
        $this->conf['crdate'] = $pObj->page['crdate'];
        // reg1 of the caching table. Not known what practical use this has.
        $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;
        // Root line uids
        $this->conf['rootline_uids'] = array();
        foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
            $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
        }
        // Content of page:
        // Content string (HTML of TYPO3 page)
        $this->conf['content'] = $pObj->content;
        // Alternative title for indexing
        $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
        // Character set of content (will be converted to utf-8 during indexing)
        $this->conf['metaCharset'] = $pObj->metaCharset;
        // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
        $this->conf['mtime'] = isset($pObj->register['SYS_LASTCHANGED']) ? $pObj->register['SYS_LASTCHANGED'] : $pObj->page['SYS_LASTCHANGED'];
        // Configuration of behavior:
        // Whether to index external documents like PDF, DOC etc. (if possible)
        $this->conf['index_externals'] = $pObj->config['config']['index_externals'];
        // Length of description text (max 250, default 200)
        $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];
        // Whether to index document keywords and description (enabled unless configured otherwise)
        $this->conf['index_metatags'] = isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true;
        // Set to zero:
        $this->conf['recordUid'] = 0;
        $this->conf['freeIndexUid'] = 0;
        $this->conf['freeIndexSetId'] = 0;
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
334
335 /****************************
336 *
337 * Backend API
338 *
339 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Fills $this->conf with the given page identification plus fixed defaults and then calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 * @return void
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = false)
{
    // cHash string for the additional parameters; stays empty unless explicitly requested
    $cHash = '';
    if ($createCHash) {
        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHash = $cacheHash->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }
    // Setting up internal configuration (replaces whatever was configured before):
    $this->conf = array(
        // Information about page for which the indexing takes place
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        // cHash string for additional parameters
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => '',
        // Root line uids
        'rootline_uids' => $uidRL,
        // Configuration of behavior:
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true
    );
    // Init and start indexing:
    $this->init();
}
394
/**
 * Sets the free-index uid. Can be called right after backend_initIndexer()
 *
 * Both values are written to $this->conf under the keys "freeIndexUid" and "freeIndexSetId".
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 * @return void
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = array(
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId
    );
    foreach ($freeIndexSettings as $confKey => $confValue) {
        $this->conf[$confKey] = $confValue;
    }
}
407
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * The given texts are escaped with htmlspecialchars() and wrapped into a minimal HTML
 * document, which is then indexed through indexTypo3PageContent().
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Escape all parts before they are embedded into the fake document
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedBody = htmlspecialchars($content);
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Construct fake HTML for parsing; this becomes the content string (HTML of TYPO3 page)
    $this->conf['content'] = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedBody . '
</body>
</html>';
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // No alternative title for indexing
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
451
452 /********************************
453 *
454 * Initialization
455 *
456 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Sets up the cHash parameters and page hashes, reads the extension configuration into the
 * age/limit properties and creates the helper objects (external parsers, lexer, metaphone
 * hook, charset converter).
 *
 * @return void
 */
public function init()
{
    // cHash parameters: add the cHash itself (so that URL's come out right) and drop the encryption key
    $cHashParameters = $this->conf['cHash_array'];
    if (is_array($cHashParameters) && !empty($cHashParameters)) {
        if ($this->conf['cHash']) {
            $cHashParameters['cHash'] = $this->conf['cHash'];
        }
        unset($cHashParameters['encryptionKey']);
    }
    $this->cHashParams = $cHashParameters;
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface (ages are configured in hours):
    $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($this->indexerConfig['minAge'] * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($this->indexerConfig['maxAge'] * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing.
    // In that case metaphone search counts as enabled.
    if (isset($this->indexerConfig['enableMetaphoneSearch'])) {
        $this->enableMetaphoneSearch = (bool)$this->indexerConfig['enableMetaphoneSearch'];
    } else {
        $this->enableMetaphoneSearch = true;
    }
    $wordTableUsed = IndexedSearchUtility::isTableUsed('index_words');
    $this->storeMetaphoneInfoAsWords = !$wordTableUsed && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words); falls back to the bundled Lexer:
    $lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
    $this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
        $metaphoneObject = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
        $this->metaphoneObj = $metaphoneObject;
        $metaphoneObject->pObj = $this;
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
502
/**
 * Initialize external parsers
 *
 * Creates one parser object per file extension registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] and keeps
 * only those whose initParser() call succeeds.
 *
 * @return void
 * @access private
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
    if (!is_array($registeredParsers)) {
        return;
    }
    foreach ($registeredParsers as $extension => $_objRef) {
        $parser = GeneralUtility::getUserObj($_objRef);
        // Register the parser before it is initialized, then hand it a back reference to this indexer
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        // Init parser and if it returns FALSE, unset its entry again:
        $initialized = $parser->initParser($extension);
        if (!$initialized) {
            unset($this->external_parsers[$extension]);
        }
    }
}
523
524 /********************************
525 *
526 * Indexing; TYPO3 pages (HTML content)
527 *
528 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Indexes $this->conf['content'] when the mtime check, a missing gr_list entry or the
 * force flag demand it. If the content hash is already known, only timestamp, set id,
 * gr_list and rootline are refreshed.
 *
 * @return void
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }
    // Setting message:
    if ($this->forceIndexing) {
        $message = 'Indexing needed, reason: Forced';
    } elseif ($check > 0) {
        $message = 'Indexing needed, reason: ' . $this->reasons[$check];
    } else {
        $message = 'Indexing needed, reason: Updates gr_list!';
    }
    $this->log_setTSlogMessage($message, 1);
    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();
    // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
    // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
    // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
    // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Same content already indexed: update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }
    // Full indexing run; the parse time is measured from here
    $Pstart = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
605
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything from the first "<body" on is the body, the text before it is the head.
 * Title and (if $this->conf['index_metatags'] is set) keyword/description meta tags are
 * taken from the head; the body is reduced to plain text.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    $contentArr = $this->defaultContentArray;
    // Divide head from body
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        // No <body> tag: historically neither head nor body are extracted in this case
        // (substr() with a length of -0 yields an empty head). Kept that way, but made explicit.
        $contentArr['body'] = '';
        $headPart = '';
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = substr($content, 0, -strlen($bodyPart));
    }
    // Get title; if it contains a colon only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
    // Get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        // Collect the attribute strings of all meta tags; each call consumes the head up to and including the tag found
        $metaTags = array();
        $attributeString = '';
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $attributeString)) {
            $metaTags[] = $attributeString;
        }
        foreach ($metaTags as $attributeString) {
            $attributes = GeneralUtility::get_tag_attributes($attributeString);
            // Meta tags without "name"/"content" (e.g. http-equiv or charset) are simply skipped
            $metaName = isset($attributes['name']) ? $attributes['name'] : '';
            $metaContent = isset($attributes['content']) ? $attributes['content'] : '';
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
657
/**
 * Extract the charset value from HTML meta tag.
 *
 * Looks for the first <meta http-equiv="Content-Type" ...> tag and reads the charset=... value from it.
 *
 * @param string $content HTML content
 * @return string|null The charset value if found, otherwise NULL.
 */
public function getHTMLcharset($content)
{
    $contentTypeMetaPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    if (!preg_match($contentTypeMetaPattern, $content, $metaTagMatch)) {
        return null;
    }
    if (!preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)) {
        return null;
    }
    return $charsetMatch[1];
}
672
/**
 * Converts a HTML document to utf-8
 *
 * The charset is taken from the argument or, if that is empty, from the document's
 * Content-Type meta tag. Entities are converted afterwards.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $sourceCharset = $this->csObj->parse_charset($charset);
    // Convert charset, unless it is unknown or the content is utf-8 already:
    $needsConversion = $sourceCharset && $sourceCharset !== 'utf-8';
    if ($needsConversion) {
        $content = $this->csObj->conv($content, $sourceCharset, 'utf-8');
    }
    // Convert entities, assuming document is now UTF-8:
    $converted = $this->csObj->entities_to_utf8($content, true);
    return $converted;
}
692
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The search is case-insensitive. If the opening tag is found but no closing tag follows,
 * $tagContent is set to an empty string and $stringAfter only receives the text following the
 * opening tag (the text in front of it is not included). The reference parameters are left
 * untouched when FALSE is returned.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $startTag = '<' . $tagName;
    $endTag = '</' . $tagName . '>';
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    // If the tag was not found, return FALSE
    if (!$fromStartTag) {
        return false;
    }
    // Split the remainder into the attribute list and everything behind the opening tag.
    // An unterminated opening tag (no ">") yields an empty remainder instead of an undefined offset.
    $openingTagParts = explode('>', substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $openingTagParts[0];
    $afterOpeningTag = isset($openingTagParts[1]) ? $openingTagParts[1] : '';
    $fromEndTag = stristr($afterOpeningTag, $endTag);
    if ($fromEndTag) {
        // The opening tag starts where $fromStartTag begins inside $string
        $stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($afterOpeningTag, 0, strlen($afterOpeningTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterOpeningTag;
    }
    return true;
}
727
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Text following a "begin" marker is
 * kept; an "end" marker appends the most recent chunk that did not start with a marker
 * (so text in front of an "end" marker that has no preceding "begin" is kept as well).
 * $body is only modified if at least one marker is present.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $chunks = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    // No marker found (or the split failed): leave the body alone
    if (!is_array($chunks) || count($chunks) <= 1) {
        return false;
    }
    $body = '';
    // Most recent chunk that did not start with a begin/end marker
    $prev = '';
    foreach ($chunks as $chunk) {
        $part = explode('-->', $chunk, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // An unterminated marker comment has no text behind it
            $body .= isset($part[1]) ? $part[1] : '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $chunk;
        }
    }
    return true;
}
755
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * External URLs (links with a scheme) are indexed if $this->indexerConfig['indexExternalURLs']
 * is set. Links to existing local files are either added to the crawler queue (if the crawler
 * is configured for external files and loaded) or indexed directly. Links with a query string
 * are ignored, except for jumpurl links, which are resolved to their target first.
 *
 * @param string $content HTML content
 * @return void
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // Stays NULL unless files are to be handed over to the crawler extension
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $this->includeCrawlerClass();
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource) ?: array();
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== false) {
            parse_str($qParts['query'], $getP);
            $linkSource = isset($getP['jumpurl']) && is_string($getP['jumpurl']) ? $getP['jumpurl'] : '';
            $qParts = parse_url($linkSource) ?: array();
        }
        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Local links with a query string are not files
        if (!empty($qParts['query'])) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            // Hand the file over to the crawler queue; the page content is not passed along
            $params = array('document' => $linkSource);
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            // Index under the linked URL, but read the file from its local path
            $fI = pathinfo($linkSource);
            $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
833
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Anchors without a href attribute (e.g. named anchors) and links whose href starts
 * with "#" are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = array();
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd-indexed parts are inspected (presumably the matched tags - verify against HtmlParser::splitTags())
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        $href = isset($tagAttributes[0]['href']) ? $tagAttributes[0]['href'] : '';
        if ($href && $href[0] !== '#') {
            $hyperLinksData[] = array(
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath($href)
            );
        }
    }
    return $hyperLinksData;
}
863
/**
 * Extracts the "base href" from content string.
 *
 * The first base tag with a non-empty href wins. A base tag without an href
 * attribute yields an empty string (never NULL), as documented.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd positions of the split result are inspected
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A missing href attribute must not leak NULL (and an undefined-index notice) to the caller
        $href = isset($tagAttributes[0]['href']) ? (string)$tagAttributes[0]['href'] : '';
        if ($href) {
            break;
        }
    }
    return $href;
}
889
890 /******************************************
891 *
892 * Indexing; external URL
893 *
894 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The URL is only downloaded and indexed when its Content-Type header
 * contains "text/html". The downloaded content is also stored in
 * $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // HTTP header field names are case-insensitive, so do not rely on the exact spelling "Content-Type"
    $contentType = '';
    foreach ($urlHeaders as $headerName => $headerValue) {
        if (strtolower($headerName) === 'content-type') {
            $contentType = (string)$headerValue;
            break;
        }
    }
    if (stristr($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing throws
        unlink($tmpFile);
    }
}
925
/**
 * Fetches the HTTP headers returned for a URL
 *
 * Header values are stored exactly as they follow the first colon of the
 * line (not trimmed). A line without a colon (such as a status line) is
 * kept as a key with a NULL value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        // Explicit FALSE as documented, instead of implicitly falling off the end with NULL
        return false;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        $parts = explode(':', $line, 2);
        // Lines without a colon have no value part; avoid an undefined offset
        $retVal[$parts[0]] = isset($parts[1]) ? $parts[1] : null;
    }
    return $retVal;
}
950
/**
 * Tries to translate a link target into a local file path.
 * Several strategies are tried in a fixed order; the first one that
 * yields a non-empty result wins.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    // Strategy methods of this class, in the order they are tried
    $strategies = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($strategies as $strategy) {
        $resolvedPath = $this->{$strategy}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
975
/**
 * Looks the link up in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'],
 * keyed by the short MD5 of the source path. This is useful for download
 * extensions that hide the actual file name but still want the file indexed.
 *
 * @param string $sourcePath
 * @return string Registered file path if it exists as a file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);
    return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
999
/**
 * Maps a URL that starts with the current TYPO3_SITE_URL onto a file below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    // Strip the site URL and look for the remainder below the site root
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1020
/**
 * Maps a URL that starts with the configured absRefPrefix onto a file
 * below PATH_site. Requires $GLOBALS['TSFE']; without it nothing is resolved.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    // An empty prefix never matches
    if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1044
/**
 * Attempts to create a local file path from the absolute URL without
 * schema, i.e. a path starting with "/". The leading slash is removed and
 * the rest is looked up below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    // Guard against an empty string: reading offset 0 of '' is an uninitialized string offset
    if ((string)$sourcePath === '' || $sourcePath[0] != '/') {
        return '';
    }
    $localPath = PATH_site . substr($sourcePath, 1);
    if (!self::isAllowedLocalFile($localPath)) {
        $localPath = '';
    }
    return $localPath;
}
1064
/**
 * Maps a relative URL (see isRelativeURL()) onto a file below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1082
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not
 * start with "/". A URL that parse_url() cannot parse, or that has no path
 * at all, is reported as relative.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns FALSE for seriously malformed URLs; keep treating those as relative
        return true;
    }
    // Both keys are optional in the parse_url() result; avoid undefined-index access
    $scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
    $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
    return $scheme == '' && ($path === '' || $path[0] != '/');
}
1094
/**
 * Tells whether the path, after resolving back paths, lies below PATH_site
 * and points to an existing file.
 *
 * @param string $filePath
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    // Must start with the site root ...
    if (substr($resolvedPath, 0, strlen(PATH_site)) != PATH_site) {
        return false;
    }
    // ... and must be a regular file
    return is_file($resolvedPath);
}
1108
1109 /******************************************
1110 *
1111 * Indexing; external files (PDF, DOC, etc)
1112 *
1113 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is handed to the external parser registered for its file
 * extension and indexed once per content part reported by that parser.
 * Failures are only written to the TS log; nothing is returned or thrown here.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init. Files without an extension have no "extension" key in pathinfo()
    $fI = pathinfo($file);
    $ext = $altExtension ?: strtolower(isset($fI['extension']) ? $fI['extension'] : '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document, one pass per content part:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // implode() takes the separator first; the reversed legacy order no longer works on PHP 8
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1220
/**
 * Asks the external parser registered for the file extension to read the file.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array Standard content array (title, description, keywords, body keys); NULL if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Without a parser object for this extension there is nothing to read
    if (!is_object($this->external_parsers[$fileExtension])) {
        return null;
    }
    return $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1239
/**
 * Asks the external parser registered for the extension into which
 * sections the document should be divided.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; array(0) if no parser object is registered
 */
public function fileContentParts($ext, $absFile)
{
    if (!is_object($this->external_parsers[$ext])) {
        // No parser object: a single, undivided part
        return [0];
    }
    return $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
}
1256
/**
 * Wraps non-HTML content (from external files for instance) into a
 * content array based on $this->defaultContentArray.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Copy of the default content array with the key "body" set to $content
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;

    return $parts;
}
1270
1271 /**********************************
1272 *
1273 * Analysing content, Extracting words
1274 *
1275 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 and
 * decodes numeric / HTML entities in it. The array is modified in place.
 *
 * @param array $contentArr Standard content array
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach (array_keys($contentArr) as $field) {
        // Empty values are left untouched
        if ((string)$contentArr[$field] === '') {
            continue;
        }
        $text = $contentArr[$field];
        if ($needsConversion) {
            $text = $this->csObj->conv($text, $charset, 'utf-8');
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$field] = $this->csObj->entities_to_utf8($text, true);
    }
}
1296
/**
 * Splits every part of the content array into words using the lexer.
 * Duplicate words are removed from title, keywords and description.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 */
public function processWordsInArrays($contentArr)
{
    // Split all parts to words
    foreach (array_keys($contentArr) as $part) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $part) {
        $contentArr[$part] = array_unique($contentArr[$part]);
    }
    return $contentArr;
}
1316
/**
 * Extracts the sample description text from the content array.
 *
 * The body has runs of whitespace collapsed to single spaces and is then
 * truncated to the length configured in $this->conf['index_descrLgd']
 * (forced into 0..255, default 200). A length of 0 yields an empty string.
 *
 * @param array $contentArr Content array
 * @return string Description string
 */
public function bodyDescription($contentArr)
{
    // Initialise the result so a configured length of 0 returns '' instead of an undefined variable
    $bodyDescription = '';
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }
    return $bodyDescription;
}
1334
/**
 * Analyzes content to use for indexing.
 * Title, keywords and description are registered with the bit offsets
 * 7, 6 and 5 respectively (see analyzeHeaderinfo()), then the body words
 * are added (see analyzeBody()).
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array, keyed by word
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
    $this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
    $this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1351
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the entry in $retArr gets its "hash" and
 * "metaphone" on first sight, the bit 2^$offset is OR-ed into "cmp" and
 * "count" is incremented. $this->wordcount is incremented per word.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 * @return void
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet on the first occurrence, so start from 0 explicitly.
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
        // Increase number of occurrences (starting from 0 when the key is not set yet)
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1384
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the entry in $retArr gets "first"
 * (the word's key in the body array), "hash" and "metaphone" on first
 * sight, and "count" is incremented. $this->wordcount is incremented per word.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences (starting from 0 when the key is not set yet)
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1415
/**
 * Computes the metaphone value of a word, using $this->metaphoneObj when
 * it is an object and PHP's native metaphone() otherwise.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Raw metaphone value if requested; otherwise its integer hash, or 0 when the raw value is an empty string
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Use native PHP function unless an advanced metaphone object is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        return 0;
    }
    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
1441
1442 /********************************
1443 *
1444 * SQL; TYPO3 Pages
1445 *
1446 *******************************/
/**
 * Writes the index records for the current TYPO3 page (not external media):
 * old records for the phash are removed, then index_phash, the section and
 * gr_list records, index_fulltext and (in debug mode) index_debug are written.
 * Each table is only written if IndexedSearchUtility::isTableUsed() allows it.
 *
 * @return void
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // Setting new phash_row
    $phashRecord = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRecord);
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fulltextRecord = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextRecord['fulltextdata'] = substr($fulltextRecord['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRecord);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // Content and body are cut to their first 1000 bytes for the debug record
    $debugInfo = [
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
        'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRecord = [
        'phash' => $this->hash['phash'],
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRecord);
    }
}
1521
/**
 * Inserts an index_grlist record holding the current gr_list
 * ($this->conf['gr_list']) and its integer hash, if that table is in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    $groupList = $this->conf['gr_list'];
    // Setting the gr_list record
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $record);
}
1543
/**
 * Inserts an index_section record for the current page id, if that table is in use.
 * The record is passed through getRootLineFields() before it is written.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 * @return void
 */
public function submit_section($hash, $hash_t3)
{
    $sectionRecord = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    $this->getRootLineFields($sectionRecord);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRecord);
}
1564
/**
 * Deletes all index records of a TYPO3 page identified by $phash,
 * including index_section records that reference it through phash_t3.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedPages($phash)
{
    $pageHash = (int)$phash;
    // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
    $tables = explode(',', 'index_phash,index_section,index_grlist,index_fulltext,index_debug');
    foreach ($tables as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . $pageHash);
    }
    // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $pageHash);
    }
}
1585
1586 /********************************
1587 *
1588 * SQL; External media
1589 *
1590 *******************************/
/**
 * Writes the index records for an external file: old records for the phash
 * are removed, then index_phash, index_fulltext and (in debug mode)
 * index_debug are written. Each table is only written if
 * IndexedSearchUtility::isTableUsed() allows it.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item type: the parser's mapping for the extension, falling back to the extension itself
    $itemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    if (!$itemType) {
        $itemType = $ext;
    }
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename; a scheme marks the file as an external URL
    $urlParts = parse_url($file);
    // Setting new
    $phashRecord = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $itemType,
        'item_title' => trim($contentParts['title']) ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $urlParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRecord);
    }
    // PROCESSING index_fulltext
    $fulltextRecord = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextRecord['fulltextdata'] = substr($fulltextRecord['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRecord);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // The body is cut to its first 1000 bytes for the debug record
    $debugInfo = [
        'cHashParams' => $subinfo,
        'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRecord = [
        'phash' => $hash['phash'],
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRecord);
    }
}
1667
/**
 * Stores a gr_list record for a file unless index_grlist already holds one
 * for this phash with either the default gr_list or the current gr_list.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $defaultListHash = IndexedSearchUtility::md5inthash($this->defaultGrList);
    $currentListHash = IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $where = 'phash=' . (int)$hash . ' AND (hash_gr_list=' . $defaultListHash . ' OR hash_gr_list=' . $currentListHash . ')';
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing == 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1684
/**
 * Stores a section record linking a file to the current page unless
 * index_section already holds one for this phash and page id.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // Testing if there is already a section
    $where = 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id'];
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
    if ($existing == 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1701
/**
 * Deletes the index records stored for the given phash.
 *
 * The rows are removed from index_phash, index_grlist, index_fulltext and index_debug;
 * tables that are not in use are skipped.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedFiles($phash)
{
    $where = 'phash=' . (int)$phash;
    $tables = explode(',', 'index_phash,index_grlist,index_fulltext,index_debug');
    foreach ($tables as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
    }
}
1718
1719 /********************************
1720 *
1721 * SQL Helper functions
1722 *
1723 *******************************/
/**
 * Decides, based on mtime / tstamp of the index_phash record for $phash, whether the
 * page or file has to be indexed (again).
 *
 * Result codes (see also $this->reasons):
 *  -2: The minimum age was not exceeded, no indexing.
 *  -1: mtime matched the indexed record, no indexing. The record's tstamp is refreshed
 *      unless a max age is configured.
 *   1: The configured max age was exceeded, index again.
 *   2: mtime differs from the indexed mtime, index again.
 *   3: No mtime was given, index.
 *   4: No indexed record was found (or index_phash is not in use), index.
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Result integer: <0 = no indexing, >0 = do indexing
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }
    $indexed = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
    if (!$indexed) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }
    $now = $GLOBALS['EXEC_TIME'];
    if ($this->tstamp_maxAge && $indexed['tstamp'] + $this->tstamp_maxAge < $now) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    // mtime is only looked at when no minimum age is set or the minimum age has passed.
    $minAgePassed = !$this->tstamp_minAge || $indexed['tstamp'] + $this->tstamp_minAge < $now;
    if (!$minAgePassed) {
        // The minimum age was not exceeded
        return -2;
    }
    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }
    if ($indexed['item_mtime'] != $mtime) {
        // The minimum age was exceeded and mtime differs from the indexed one, so the page is indexed.
        return 2;
    }
    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexed['tstamp'] + $this->tstamp_maxAge - $now) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1780
/**
 * Looks up the content hash of the current page in the index_phash table.
 *
 * The lookup is restricted to records of the same "phash_grouping", so the page is only
 * indexed if its content differs from an already indexed variant of the same page.
 *
 * @return mixed TRUE if the page needs to be indexed (no matching record, or index_phash is not in use), otherwise the matching row (array with the "phash" value) to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $where = 'phash_grouping=' . (int)$this->hash['phash_grouping'] . ' AND contentHash=' . (int)$this->content_md5h;
    $match = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
    return $match ? $match : true;
}
1798
/**
 * Looks up the content hash of an external document in the index_phash table.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed, i.e. no record with this grouping and content hash was counted, or index_phash is not in use
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $where = 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h;
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
    return $matches == 0;
}
1816
/**
 * Tells whether a grlist record exists for the given phash_x value, i.e. the "real" phash
 * of the current content, not the linked-to phash of the common search result page.
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if at least one index_grlist record was counted, FALSE otherwise (also when index_grlist is not in use)
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
    return $matches > 0;
}
1832
/**
 * Writes a grlist entry for the given phash and the current gr_list unless one exists.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values differ when a page with user login turns out to contain exactly the same content as another, already indexed version of the page; this is the reason the grlist table exists.
 * @return void
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $where = 'phash=' . (int)$phash . ' AND hash_gr_list=' . IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing == 0) {
        $this->submit_grlist($phash, $phash_x);
        $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
    }
}
1851
/**
 * Sets the tstamp of an index_phash row to the current execution time and, optionally,
 * its item_mtime as well.
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-zero), the item_mtime field is updated to this value.
 * @return void
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = array(
        'tstamp' => $GLOBALS['EXEC_TIME']
    );
    if ($mtime) {
        $changes['item_mtime'] = (int)$mtime;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1871
/**
 * Writes the configured free index set id ($this->conf['freeIndexSetId']) into the
 * index_phash record of the given phash.
 *
 * @param int $phash phash value
 * @return void
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = array(
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    );
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1887
/**
 * Stores the parse time in the index_phash record of the given phash.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 * @return void
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = array(
        'parsetime' => (int)$parsetime
    );
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1904
/**
 * Refreshes the rootline fields of all index_section records belonging to the current
 * page ($this->conf['id']). The field values are collected by getRootLineFields().
 *
 * @return void
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $rootLineFields = array();
    // Filled by reference.
    $this->getRootLineFields($rootLineFields);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int)$this->conf['id'], $rootLineFields);
}
1918
/**
 * Adds the root-line field values to the given field array.
 *
 * rl0, rl1 and rl2 are always set from $this->conf['rootline_uids']. Further fields can be
 * registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as "field name => rootline level" pairs.
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 */
public function getRootLineFields(array &$fieldArray)
{
    // Standard levels
    foreach (array(0, 1, 2) as $level) {
        $fieldArray['rl' . $level] = (int)$this->conf['rootline_uids'][$level];
    }
    // Additionally registered levels
    if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
        return;
    }
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)$this->conf['rootline_uids'][$rootLineLevel];
    }
}
1937
/**
 * Loads the file class.tx_crawler_lib.php from the path of the "crawler" extension.
 *
 * @return void
 */
public function includeCrawlerClass()
{
    $crawlerPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler');
    require_once $crawlerPath . 'class.tx_crawler_lib.php';
}
1947
1948 /********************************
1949 *
1950 * SQL; Submitting words
1951 *
1952 *******************************/
/**
 * Inserts the words of the list that are not yet stored in index_words.
 *
 * The number of index_words rows matching the word hashes is compared with the size of
 * the list. Only if they differ, the stored basewords are fetched, removed from the list
 * and the remaining words are inserted.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc). Keys are the words; each entry provides "hash" and "metaphone".
 * @return void
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }
    $wordIds = array();
    foreach ($wordListArray as $wordInfo) {
        $wordIds[] = (int)$wordInfo['hash'];
    }
    $where = 'wid IN (' . implode(',', $wordIds) . ')';
    $knownCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
    $listCount = count($wordListArray);
    if ($knownCount === $listCount) {
        // Every word of the list is stored already.
        return;
    }
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
    $this->log_setTSlogMessage('Inserting words: ' . ($listCount - $knownCount), 1);
    // Drop the words that exist in the table, so only new ones remain.
    while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        unset($wordListArray[$row['baseword']]);
    }
    $GLOBALS['TYPO3_DB']->sql_free_result($res);
    foreach ($wordListArray as $word => $wordInfo) {
        // A duplicate-key error will occur here if a word was NOT removed from the list above.
        // However, as long as the words are not longer than 60 chars (the baseword varchar is
        // 60 characters...) this is not a problem.
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
            'wid' => $wordInfo['hash'],
            'baseword' => $word,
            'metaphone' => $wordInfo['metaphone']
        ));
    }
}
1990
/**
 * Submits RELATIONS between words and phash
 *
 * All existing index_rel rows of the phash are deleted first. Then one row per word is
 * inserted, except for words whose id is flagged as stop word in index_words. The stored
 * frequency is freqMap(count / $this->wordcount), the flags are the word's "cmp" value
 * masked with $this->flagBitMask.
 *
 * @param array $wordList Word list array; each entry provides "hash", "count", "first" and "cmp".
 * @param int $phash phash value
 * @return void
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    // Indexed by wid, so a stop word can be detected with a plain isset().
    $stopWords = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('wid', 'index_words', 'is_stopword != 0', '', '', '', 'wid');

    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . (int)$phash);
    $fields = array('phash', 'wid', 'count', 'first', 'freq', 'flags');
    $rows = array();
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        $rows[] = array(
            (int)$phash,
            (int)$val['hash'],
            (int)$val['count'],
            (int)$val['first'],
            $this->freqMap($val['count'] / $this->wordcount),
            $val['cmp'] & $this->flagBitMask
        );
    }
    // Nothing is left to store if the word list was empty or consisted of stop words only;
    // do not ask the database layer for an INSERT without any value rows in that case.
    if (!empty($rows)) {
        $GLOBALS['TYPO3_DB']->exec_INSERTmultipleRows('index_rel', $fields, $rows);
    }
}
2022
/**
 * Maps a frequency given as real number in [0;1] to a value in [0;$this->freqRange],
 * and back.
 *
 * The map factor is $this->freqMax * 100 * $this->freqRange. Values up to 1 are multiplied
 * by it and capped at $this->freqRange; values above 1 are divided by it.
 *
 * @param float $freq Frequency
 * @return float|int Frequency in range. The result is not cast, so it is only an integer if the calculation yields one.
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;
    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }
    return $freq / $mapFactor;
}
2041
2042 /********************************
2043 *
2044 * Hashing
2045 *
2046 *******************************/
/**
 * Calculates the search hashes of the current TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" as the combination of id, type, language,
 * mount point and cHash parameters. "phash" additionally takes the gr_list into account,
 * i.e. it is the subdivision for page compositions that depend on the login. Such pages
 * are expected to be similar regardless of the login.
 *
 * @return void
 */
public function setT3Hashes()
{
    // The order of the keys is significant, because the serialized array is hashed.
    $pageIdentity = array(
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    );
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
    $this->hash['phash_grouping'] = $groupingHash;
    // Same data plus the group list gives the plain phash.
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $plainHash = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
    $this->hash['phash'] = $plainHash;
}
2068
/**
 * Calculates the search hashes of an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the file name
 * together with the sub info.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = array())
{
    // The order of the keys is significant, because the serialized array is hashed.
    $fileIdentity = array(
        'file' => $file
    );
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));
    $fileIdentity['subinfo'] = $subinfo;
    $plainHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));
    return array(
        'phash_grouping' => $groupingHash,
        'phash' => $plainHash
    );
}
2090
2091 /*********************************
2092 *
2093 * Internal logging functions
2094 *
2095 *********************************/
/**
 * Wrapper for TT logging: forwards to push() of the time tracker.
 *
 * @param string $msg Title to set
 * @param string $key Key, passed on unchanged as second argument of push()
 * @return void
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2107
/**
 * Wrapper for TT logging: forwards to pull() of the time tracker.
 *
 * @return void
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2117
/**
 * Wrapper for TT logging: forwards the message to setTSlogMessage() of the time tracker
 * and, in addition, appends it to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 * @return void
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);
    // Only the message text is kept internally, not the error number.
    $this->internal_log[] = $msg;
}
2130
/**
 * Makes sure that keywords are space-separated. This is important for displaying them
 * properly as a part of the fulltext index.
 *
 * The comma-separated list is split with GeneralUtility::trimExplode(), joined again
 * with ", " and wrapped in a leading and a trailing space.
 *
 * @param string $keywordList Comma-separated list of keywords
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    return ' ' . implode(', ', GeneralUtility::trimExplode(',', $keywordList)) . ' ';
}
2144 }