01cc19abbe09b8e225f200ea1e4a385e73711104
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18 use TYPO3\CMS\Core\Utility\MathUtility;
19 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
20 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
21
22 /**
23 * Indexing class for TYPO3 frontend
24 */
25 class Indexer
26 {
/**
 * Human readable explanations for the numeric result codes used when deciding
 * whether a document has to be (re-)indexed. Positive codes are logged as the
 * reason for indexing, negative codes as the reason for skipping it.
 *
 * @var array
 */
public $reasons = array(
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * HTML code blocks to exclude from indexing (comma separated list of tag names)
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * Supported extensions for external files: external parser objects, keys are
 * file extension names. Values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = array();

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Min/Max times:
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit (in seconds) before a document can be indexed again.
 * This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Template for the array of content parts produced when splitting a document.
 *
 * @var array
 */
public $defaultContentArray = array(
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
);

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = array();

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 */
public $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = array();

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = array();

/**
 * Content of TYPO3 page, split into parts (see $defaultContentArray for the keys)
 *
 * @var array
 */
public $contentParts = array();

/**
 * Hash calculated over the content parts of the page being indexed
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = array();

/**
 * Content fetched by the most recent indexExternalUrl() call
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = array();

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Bit mask taken from the extension configuration; init() forces it into the range 0-255.
 *
 * @var int
 */
public $flagBitMask;
229
/**
 * Frontend hook: decides whether the page rendered by TSFE should be indexed
 * and, if so, copies the relevant page data into $this->conf and starts indexing.
 *
 * @param TypoScriptFrontendController $pObj Parent Object (TSFE), passed by reference
 * @return void
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    // Crawler activation requires that the crawler is loaded, a crawler session is
    // running and re-indexing was requested as processing instruction:
    $crawlerRequestsReindex = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && $pObj->applicationData['tx_crawler']['running']
        && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
    if ($crawlerRequestsReindex) {
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }
    // Nothing to do (and nothing to log) unless indexing is enabled for this page
    if (!$pObj->config['config']['index_enable']) {
        return;
    }
    $this->log_push('Index page', '');
    // Pick the first reason, if any, that forbids indexing this request
    $skipMessage = '';
    if ($extensionSettings['disableFrontendIndexing'] && !$this->crawlerActive) {
        $skipMessage = 'Index page? No, Ordinary Frontend indexing during rendering is disabled.';
    } elseif ($pObj->page['no_search']) {
        $skipMessage = 'Index page? No, The "No Search" flag has been set in the page properties!';
    } elseif ($pObj->no_cache) {
        $skipMessage = 'Index page? No, page was set to "no_cache" and so cannot be indexed.';
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $skipMessage = 'Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.';
    }
    if ($skipMessage !== '') {
        $this->log_setTSlogMessage($skipMessage);
    } else {
        // Root line uids
        $rootlineUids = array();
        foreach ($pObj->config['rootLine'] as $level => $rootlinePage) {
            $rootlineUids[$level] = $rootlinePage['uid'];
        }
        $pageConfig = $pObj->config['config'];
        // Setting up internal configuration from the frontend controller:
        $this->conf = array(
            // Page id
            'id' => $pObj->id,
            // Page type
            'type' => $pObj->type,
            // sys_language UID of the language of the indexing
            'sys_language_uid' => $pObj->sys_language_uid,
            // MP variable, if any (Mount Points)
            'MP' => $pObj->MP,
            // Group list
            'gr_list' => $pObj->gr_list,
            // cHash string for additional parameters
            'cHash' => $pObj->cHash,
            // Array of the additional parameters
            'cHash_array' => $pObj->cHash_array,
            // The creation date of the TYPO3 page
            'crdate' => $pObj->page['crdate'],
            // reg1 of the caching table
            'page_cache_reg1' => $pObj->page_cache_reg1,
            'rootline_uids' => $rootlineUids,
            // Content string (HTML of TYPO3 page)
            'content' => $pObj->content,
            // Alternative title for indexing
            'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
            // Character set of content (will be converted to utf-8 during indexing)
            'metaCharset' => $pObj->metaCharset,
            // Most recent modification time (seconds) of the content on the page
            'mtime' => isset($pObj->register['SYS_LASTCHANGED']) ? $pObj->register['SYS_LASTCHANGED'] : $pObj->page['SYS_LASTCHANGED'],
            // Whether to index external documents like PDF, DOC etc. (if possible)
            'index_externals' => $pageConfig['index_externals'],
            // Length of description text (max 250, default 200)
            'index_descrLgd' => $pageConfig['index_descrLgd'],
            // Whether to index keywords and description meta tags; on unless configured otherwise
            'index_metatags' => isset($pageConfig['index_metatags']) ? $pageConfig['index_metatags'] : true,
            // Set to zero:
            'recordUid' => 0,
            'freeIndexUid' => 0,
            'freeIndexSetId' => 0
        );
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
320
321 /****************************
322 *
323 * Backend API
324 *
325 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Fills $this->conf with the given page identification plus fixed defaults and calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 * @return void
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = false)
{
    // cHash string for additional parameters; only calculated on request
    $cHash = '';
    if ($createCHash) {
        /* @var $cacheHashCalculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHashCalculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $queryString = GeneralUtility::implodeArrayForUrl('', $cHash_array);
        $cHash = $cacheHashCalculator->generateForParameters($queryString);
    }
    // Setting up internal configuration:
    $this->conf = array(
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => '',
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true
    );
    // Init and start indexing:
    $this->init();
}
380
/**
 * Stores the free-index uid and set id in the internal configuration.
 * Can be called right after backend_initIndexer()
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 * @return void
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = array(
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId
    );
    foreach ($freeIndexSettings as $confKey => $confValue) {
        $this->conf[$confKey] = $confValue;
    }
}
393
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * The given texts are wrapped into a minimal HTML document which is then indexed
 * through indexTypo3PageContent().
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // All parts are escaped before they are placed into the fake document
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedBody = htmlspecialchars($content);
    // Construct fake HTML for parsing (content string, HTML of TYPO3 page):
    $this->conf['content'] = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedBody . '
</body>
</html>';
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // No alternative title for indexing
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
437
438 /********************************
439 *
440 * Initialization
441 *
442 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Sets up the cHash parameters, the page hashes, the limits taken from the extension
 * configuration, the external parsers (if enabled), the lexer, the optional metaphone
 * object and the charset converter.
 *
 * @return void
 */
public function init()
{
    // cHash parameters; the cHash itself is added so that URL's come out right
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && count($this->cHashParams) > 0) {
        if ($this->conf['cHash']) {
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    $settings = $this->indexerConfig;
    // minAge / maxAge are configured in hours and kept in seconds
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($settings['minAge'] * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($settings['maxAge'] * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($settings['maxExternalFiles'], 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($settings['flagBitMask'], 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    // and metaphone search counts as enabled
    $this->enableMetaphoneSearch = isset($settings['enableMetaphoneSearch']) ? (bool)$settings['enableMetaphoneSearch'] : true;
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }
    $hookConfiguration = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'];
    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjRef = $hookConfiguration['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
    $this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch && $hookConfiguration['metaphone']) {
        $this->metaphoneObj = GeneralUtility::getUserObj($hookConfiguration['metaphone']);
        $this->metaphoneObj->pObj = $this;
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
488
/**
 * Instantiates the external parsers registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds.
 *
 * @return void
 * @access private
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
    if (!is_array($registeredParsers)) {
        return;
    }
    foreach ($registeredParsers as $fileExtension => $parserReference) {
        $parser = GeneralUtility::getUserObj($parserReference);
        // The parser is registered before it is initialized, exactly as callers of initParser() may expect
        $this->external_parsers[$fileExtension] = $parser;
        $parser->pObj = $this;
        // Init parser and if it returns FALSE, unset its entry again:
        $isUsable = $parser->initParser($fileExtension);
        if (!$isUsable) {
            unset($this->external_parsers[$fileExtension]);
        }
    }
}
509
510 /********************************
511 *
512 * Indexing; TYPO3 pages (HTML content)
513 *
514 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Indexes $this->conf['content'] when the mtime check, a missing gr_list entry or
 * forced indexing demands it. If a page with the same content hash is already indexed
 * only timestamp, set id, gr_list and rootline are updated.
 *
 * @return void
 */
public function indexTypo3PageContent()
{
    $phash = $this->hash['phash'];
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $phash);
    $is_grlist = $this->is_grlist_set($phash);
    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }
    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);
    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();
    // The hash covers title, description and keywords as well, so that e.g. a changed
    // page title leads to re-indexing.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
    // Checks if there is already a page (with gr_list = 0,-1) indexed which has the very same
    // contentHash. If so, the content is already indexed and regardless of mtime only the meta
    // information needs an update. This also prevents re-indexing for logged in fe_users when
    // the page content is the same as for not logged in visitors.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }
    $startTime = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $startTime);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
591
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" counts as head. If the document has no "<body"
 * tag at all, the whole document is treated as head (so title and meta tags are still
 * found) and the body stays empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    $contentArr = $this->defaultContentArray;
    // Divide head from body
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        // Without this branch the head would be cut to an empty string (length "-0")
        $contentArr['body'] = '';
        $headPart = (string)$content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = (string)substr($content, 0, strlen($content) - strlen($bodyPart));
    }
    // Get title; if it contains a colon only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
    // Get keywords and description metatags
    if ($this->conf['index_metatags']) {
        $meta = array();
        $i = 0;
        // Each successful call stores the attribute string of one meta tag and
        // continues with the remaining head content
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        foreach ($meta as $attributeString) {
            // The last, unsuccessful call leaves a NULL entry behind
            if (!is_string($attributeString)) {
                continue;
            }
            $attributes = GeneralUtility::get_tag_attributes($attributeString);
            // Meta tags without name or content (e.g. charset declarations) are harmless
            $metaName = isset($attributes['name']) ? $attributes['name'] : '';
            $metaContent = isset($attributes['content']) ? $attributes['content'] : '';
            if (stristr($metaName, 'keywords')) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stristr($metaName, 'description')) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    return $contentArr;
}
643
/**
 * Extract the charset value from the "Content-Type" http-equiv meta tag of a HTML document.
 *
 * @param string $content HTML content
 * @return string|NULL The charset value if found, otherwise NULL.
 */
public function getHTMLcharset($content)
{
    $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    // Only the first matching meta tag is inspected
    if (!preg_match($metaTagPattern, $content, $metaTagMatch)) {
        return null;
    }
    if (!preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)) {
        return null;
    }
    return $charsetMatch[1];
}
658
/**
 * Converts a HTML document to utf-8, including its entities
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset: fall back to the one declared in the document itself
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $normalizedCharset = $this->csObj->parse_charset($charset);
    // Convert charset, unless it is unknown or utf-8 already:
    $needsConversion = $normalizedCharset && $normalizedCharset !== 'utf-8';
    if ($needsConversion) {
        $content = $this->csObj->conv($content, $normalizedCharset, 'utf-8');
    }
    // Convert entities, assuming document is now UTF-8:
    $utf8Content = $this->csObj->entities_to_utf8($content, true);
    return $utf8Content;
}
678
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the start tag has no matching end tag, $tagContent is empty and $stringAfter is
 * only what follows the start tag. A start tag that is never closed with ">" yields an
 * empty $stringAfter.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // Case-insensitive search for the start tag
    $startPosition = stripos((string)$string, $startTag);
    if ($startPosition === false) {
        return false;
    }
    // Split what follows the tag name into the attribute list and the rest of the string
    $afterTagName = (string)substr($string, $startPosition + strlen($startTag));
    $parts = explode('>', $afterTagName, 2);
    $paramList = $parts[0];
    // Guard against a start tag without closing ">", which has no "rest"
    $afterStartTag = isset($parts[1]) ? $parts[1] : '';
    $afterTagInText = stristr($afterStartTag, $endTag);
    if ($afterTagInText !== false) {
        $stringBefore = (string)substr($string, 0, $startPosition);
        $tagContent = (string)substr($afterStartTag, 0, strlen($afterStartTag) - strlen($afterTagInText));
        $stringAfter = $stringBefore . substr($afterTagInText, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
713
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * Content following a "begin" marker is kept. Content preceding an "end" marker is
 * kept only if it was not already consumed by a "begin" marker.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Content seen since the last "begin" marker; initialized so that a leading
    // "end" marker does not read an undefined variable
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A "begin" marker without closing "-->" contributes nothing
            $body .= isset($part[1]) ? $part[1] : '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
741
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme are indexed as external URLs if "indexExternalURLs" is configured.
 * Links without query string which resolve to an existing local file are either indexed
 * directly or, if the crawler is used for external files, added to the crawler queue.
 *
 * @param string $content HTML content
 * @return void
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // Stays NULL unless external files are to be handed over to the crawler
    $crawler = null;
    if ($this->indexerConfig['useCrawlerForExternalFiles'] && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $this->includeCrawlerClass();
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs, hence the isset() checks):
        $qParts = parse_url($linkSource);
        $query = isset($qParts['query']) ? $qParts['query'] : '';
        // Check for jumpurl (TYPO3 specific thing...)
        if ($query && strpos($query, 'jumpurl=') !== false) {
            parse_str($query, $getP);
            $linkSource = isset($getP['jumpurl']) && is_string($getP['jumpurl']) ? $getP['jumpurl'] : '';
            $qParts = parse_url($linkSource);
            $query = isset($qParts['query']) ? $qParts['query'] : '';
        }
        $scheme = isset($qParts['scheme']) ? $qParts['scheme'] : '';
        if (!$hasLocalPath && $scheme) {
            if ($this->indexerConfig['indexExternalURLs']) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Local links with a query string are not treated as files
        if ($query) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = array('document' => $linkSource);
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            // The page content is not needed in the queue entry
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            $fI = pathinfo($linkSource);
            $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
819
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Links without href and links whose href starts with "#" are left out.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $parser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $links = array();
    foreach ($parser->splitTags('a', $html) as $position => $fragment) {
        // Only the odd entries of the split result are inspected as tags
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $parser->get_tag_attributes($fragment, true);
        $tagName = $parser->getFirstTagName($fragment);
        if (strtolower($tagName) !== 'a') {
            continue;
        }
        $href = $attributes[0]['href'];
        if (!$href || $href[0] == '#') {
            continue;
        }
        $links[] = array(
            'tag' => $fragment,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        );
    }
    return $links;
}
849
/**
 * Extracts the "base href" from the content string.
 *
 * The first <base> tag that carries a non-empty href attribute wins.
 * <base> tags without href (for example ones that only set a target)
 * are skipped, so the method always returns a string.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    foreach ($htmlParser->splitTags('base', $html) as $index => $tagData) {
        // Only the odd-numbered parts of the split result are examined
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // empty() avoids reading an undefined index when the tag has no href
        if (!empty($tagAttributes[0]['href'])) {
            return $tagAttributes[0]['href'];
        }
    }
    return '';
}
875
876 /******************************************
877 *
878 * Indexing; external URL
879 *
880 ******************************************/
/**
 * Indexes the HTML content of an external URL.
 *
 * The headers of the URL are fetched first; indexing only takes place when
 * the response announces a "text/html" content type. The content is then
 * downloaded, written to a temporary file, handed to indexRegularDocument()
 * and the temporary file is removed again.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // HTTP header names are case-insensitive, so normalize them before the lookup
    $urlHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
    if (!isset($urlHeaders['content-type']) || stristr($urlHeaders['content-type'], 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    GeneralUtility::writeFile($tmpFile, $content);
    // Index that file. Using "TRUE" for the second parameter forces indexing
    // of external URLs (mtime doesn't make sense for them).
    $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    unlink($tmpFile);
}
911
/**
 * Fetches the HTTP response headers of a URL.
 *
 * Each header line is split at the first colon; the part before it becomes
 * the array key, the trimmed part after it the value. Lines without a colon
 * (such as the HTTP status line) are stored with the whole line as key and
 * NULL as value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return false;
    }
    // Compile headers:
    $retVal = array();
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        $parts = explode(':', $line, 2);
        // Guard against lines that carry no "key: value" pair
        $retVal[$parts[0]] = isset($parts[1]) ? trim($parts[1]) : null;
    }
    return $retVal;
}
936
/**
 * Tries to map a link source to a file on the local file system.
 *
 * A fixed chain of resolver methods is asked in order; the first one that
 * delivers a non-empty path decides the result.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    static $resolvers = array(
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    );
    $resolved = '';
    foreach ($resolvers as $resolver) {
        $resolved = $this->{$resolver}($sourcePath);
        if ($resolved != '') {
            return $resolved;
        }
    }
    return $resolved;
}
961
/**
 * Looks up a local file path in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 * This is useful for various download extensions that hide the actual file
 * name but still want the file to be indexed.
 *
 * The lookup key is the short MD5 of the source path.
 *
 * @param string $sourcePath
 * @return string Registered path if it points to an existing file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: self::isAllowedLocalFile is deliberately not used here because
    // this method is allowed to index files outside of the web site (for
    // example, protected downloads)
    if (isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey])) {
        return $registeredFiles[$lookupKey];
    }
    return '';
}
985
/**
 * Attempts to create a local file path by stripping the site URL of the
 * current request (TYPO3_SITE_URL) from the beginning of the source path.
 *
 * @param string $sourcePath
 * @return string Local path below PATH_site if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1006
/**
 * Attempts to create a local file path by stripping config.absRefPrefix from
 * the beginning of the source path. This requires TSFE. If TSFE is missing
 * or no absRefPrefix is set, this function does nothing.
 *
 * @param string $sourcePath
 * @return string Local path below PATH_site if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    if ($prefixLength === 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1030
/**
 * Attempts to create a local file path from the absolute URL without
 * schema, i.e. a source path that starts with a slash.
 *
 * The leading slash is removed and the remainder is appended to PATH_site.
 * An empty source path is rejected up front so that no string offset is
 * read on an empty string.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    if ((string)$sourcePath === '' || $sourcePath[0] !== '/') {
        return '';
    }
    $localPath = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1050
/**
 * Attempts to create a local file path from a relative URL by appending it
 * to PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if the URL is relative and passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1068
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has neither a scheme nor a host and its
 * path does not start with a slash. URLs that parse_url() cannot parse at
 * all are not considered relative.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    // parse_url() returns FALSE on seriously malformed URLs
    if (!is_array($urlParts)) {
        return false;
    }
    $scheme = isset($urlParts['scheme']) ? (string)$urlParts['scheme'] : '';
    $host = isset($urlParts['host']) ? (string)$urlParts['host'] : '';
    // A host without scheme is a protocol-relative URL ("//example.com/..."), not a relative path
    if ($scheme !== '' || $host !== '') {
        return false;
    }
    $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
    return $path === '' || $path[0] !== '/';
}
1080
/**
 * Checks if the path points to an existing file inside the web site.
 *
 * Back path segments are resolved first; the resolved path then has to
 * start with PATH_site and has to be a regular file.
 *
 * @param string $filePath
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    if (substr($resolvedPath, 0, strlen(PATH_site)) != PATH_site) {
        return false;
    }
    return is_file($resolvedPath);
}
1094
1095 /******************************************
1096 *
1097 * Indexing; external files (PDF, DOC, etc)
1098 *
1099 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The file is indexed once per content part reported by fileContentParts().
 * For each part the method decides (by mtime/tstamp, external file counter
 * and content hash) whether the index has to be rebuilt, and it always makes
 * sure a section record for the file exists.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init; PATHINFO_EXTENSION yields an empty string for files without extension
    $ext = $altExtension ?: strtolower(pathinfo($file, PATHINFO_EXTENSION));
    // Create abs-path:
    if ($contentTmpFile) {
        $absFile = $contentTmpFile;
    } else {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = array();
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $subinfo = array('key' => $cPKey);
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // The separator comes first: the reversed argument order is no longer accepted by PHP 8
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections: a section-record for the file is set
        // even if the file is not indexed. Notice that section records are
        // deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1206
/**
 * Reads the content of an external file being indexed by delegating to the
 * external parser object registered for the file extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array Standard content array (title, description, keywords, body keys); NULL if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult relevant external document parser:
    if (!is_object($this->external_parsers[$fileExtension])) {
        return null;
    }
    return $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1225
/**
 * Creates an array with pointers to divisions of the document by asking the
 * external parser object registered for the file extension.
 *
 * Without a parser object the document is treated as one single part (array(0)).
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile)
{
    // Consult relevant external document parser:
    if (!is_object($this->external_parsers[$ext])) {
        return array(0);
    }
    return $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
}
1242
/**
 * Wraps non-HTML content (from external files for instance) into a standard
 * content array: a copy of the default content array with "body" set to the
 * given content.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Content array based on $this->defaultContentArray with the key "body" set
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;
    return $parts;
}
1256
1257 /**********************************
1258 *
1259 * Analysing content, Extracting words
1260 *
1261 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * input charset already is "utf-8") and decodes numeric / HTML entities in
 * it to real characters.
 *
 * @param array $contentArr Standard content array, modified in place
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $mustConvert = $charset !== 'utf-8';
    foreach (array_keys($contentArr) as $part) {
        // Empty parts need neither conversion nor entity decoding
        if ((string)$contentArr[$part] === '') {
            continue;
        }
        if ($mustConvert) {
            $contentArr[$part] = $this->csObj->conv($contentArr[$part], $charset, 'utf-8');
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$part] = $this->csObj->entities_to_utf8($contentArr[$part], true);
    }
}
1282
/**
 * Splits every part of the content array into words using the lexer object.
 * Duplicate words are removed from the "title", "keywords" and
 * "description" parts.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach ($contentArr as $part => $text) {
        $contentArr[$part] = $this->lexerObj->split2Words($text);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (array('title', 'keywords', 'description') as $part) {
        $contentArr[$part] = array_unique($contentArr[$part]);
    }
    return $contentArr;
}
1302
/**
 * Extracts the sample description text from the content array.
 *
 * Whitespace runs in the body are collapsed to single spaces and the result
 * is truncated to the configured length ($this->conf['index_descrLgd'],
 * forced into the range 0-255 with 200 as default). When the resulting
 * length is zero an empty string is returned.
 *
 * @param array $contentArr Content array
 * @return string Description string
 */
public function bodyDescription($contentArr)
{
    // Initialized up front so the method returns a string even if no description is built
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }
    return $bodyDescription;
}
1320
/**
 * Analyzes content to use for indexing.
 *
 * The title, keywords and description words are registered with the
 * priority offsets 7, 6 and 5 via analyzeHeaderinfo(), followed by the body
 * words via analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array, keyed by word, as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content)
{
    $indexArr = array();
    $this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
    $this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
    $this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1337
/**
 * Calculates relevant information for header content (title, keywords, description)
 *
 * For every word of $content[$key] the index array entry gets its hash and
 * metaphone value (on first occurrence), the priority bit 2^$offset is set
 * in "cmp" and "count" is increased. $this->wordcount is increased as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 * @return void
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // Start from 0 when the word has no flags yet instead of reading an undefined key.
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
        // Increase number of occurrences
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1370
/**
 * Calculates relevant information for body content
 *
 * For every word of $content['body'] the index array entry gets the position
 * of its first occurrence, its hash and metaphone value (only when the word
 * is not yet in the index array) and "count" is increased.
 * $this->wordcount is increased as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences; start from 0 instead of incrementing an undefined key
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1401
/**
 * Creates a metaphone based hash from the input word.
 *
 * The raw metaphone value comes from $this->metaphoneObj if that is an
 * object, otherwise from PHP's native metaphone() function.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 if the raw value is an empty string), or the raw value (string) if requested
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Use native PHP function instead of advanced doubleMetaphone class when no object is set
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        return 0;
    }
    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
1427
1428 /********************************
1429 *
1430 * SQL; TYPO3 Pages
1431 *
1432 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Old records for the page's phash are removed first. Then the index_phash
 * record, the section record, the gr_list record, the index_fulltext record
 * and (in debug mode) the index_debug record are written. Each table is
 * only written if IndexedSearchUtility::isTableUsed() reports it as used.
 *
 * @return void
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // setting new phash_row
    $phashRow = array(
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // Item type 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    );
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fulltextRow = array(
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    );
    $maxFulltextLength = $this->indexerConfig['fullTextDataLength'];
    if ($maxFulltextLength > 0) {
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $maxFulltextLength);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // Page content and body are cut to 1000 bytes for the debug record
    $debugInfo = array(
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
        'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRow = array(
        'phash' => $this->hash['phash'],
        'debuginfo' => serialize($debugInfo)
    );
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
    }
}
1507
/**
 * Stores the current gr_list ($this->conf['gr_list']) together with its
 * integer hash in the index_grlist table, if that table is in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    // Setting the gr_list record
    $groupList = $this->conf['gr_list'];
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ));
}
1529
/**
 * Stores a section record for the current page id in index_section.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is passed through getRootLineFields() before it is inserted.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 * @return void
 */
public function submit_section($hash, $hash_t3)
{
    $sectionRow = array(
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    );
    $this->getRootLineFields($sectionRow);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1550
/**
 * Removes all index records of the indexed page with the given phash.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedPages($phash)
{
    // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
    $pageTables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($pageTables as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . (int)$phash);
    }
    // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . (int)$phash);
    }
}
1571
1572 /********************************
1573 *
1574 * SQL; External media
1575 *
1576 *******************************/
/**
 * Updates db with information about the file
 *
 * Old records for the file's phash are removed first. Then the index_phash
 * record, the index_fulltext record and (in debug mode) the index_debug
 * record are written. Each table is only written if
 * IndexedSearchUtility::isTableUsed() reports it as used.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: the parser's mapping wins, the plain extension is the fallback
    $mappedItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    $itemType = $mappedItemType ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename; a scheme marks the document as external URL
    $urlParts = parse_url($file);
    // Setting new
    $phashRow = array(
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $itemType,
        'item_title' => trim($contentParts['title']) ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $urlParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    );
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
    }
    // PROCESSING index_fulltext
    $fulltextRow = array(
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    );
    $maxFulltextLength = $this->indexerConfig['fullTextDataLength'];
    if ($maxFulltextLength > 0) {
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $maxFulltextLength);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // The body is cut to 1000 bytes for the debug record
    $debugInfo = array(
        'cHashParams' => $subinfo,
        'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRow = array(
        'phash' => $hash['phash'],
        'debuginfo' => serialize($debugInfo)
    );
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
    }
}
1653
/**
 * Stores the gr_list for a file IF no matching record exists already.
 *
 * A record counts as matching when it belongs to the phash and its
 * hash_gr_list equals either the hash of the default group list
 * ($this->defaultGrList) or the hash of the current group list.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $defaultListHash = IndexedSearchUtility::md5inthash($this->defaultGrList);
    $currentListHash = IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $where = 'phash=' . (int)$hash . ' AND (hash_gr_list=' . $defaultListHash . ' OR hash_gr_list=' . $currentListHash . ')';
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing == 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1670
/**
 * Stores the file section for a file IF no section record exists yet for
 * the file's phash on the current page id.
 *
 * The new section record links the file's phash to the phash of the current
 * page ($this->hash['phash']).
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // Testing if there is already a section
    $where = 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id'];
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
    if ($existing == 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1687
/**
 * Deletes the rows for the given phash from index_phash, index_grlist,
 * index_fulltext and index_debug (each only if that table is in use).
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedFiles($phash)
{
    // The same condition applies to every table, so build it once.
    $deleteWhere = 'phash=' . (int)$phash;
    foreach (array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug') as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $deleteWhere);
    }
}
1704
1705 /********************************
1706 *
1707 * SQL Helper functions
1708 *
1709 *******************************/
/**
 * Decides from the stored tstamp / item_mtime of an index_phash row whether
 * the page or file identified by $phash has to be (re-)indexed.
 *
 * Result codes (see $this->reasons): negative = do not index, positive = index.
 *  -2: minimum age has not been exceeded yet
 *  -1: mtime matches the indexed row (tstamp is refreshed unless a max age is configured)
 *   1: configured max age exceeded
 *   2: minimum age passed (or not set) and mtime differs from the indexed row
 *   3: minimum age passed (or not set) and no mtime was given
 *   4: no index_phash row found, or the index_phash table is not in use
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int One of the result codes listed above
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }
    $indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
    if (!$indexedRow) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }
    $now = $GLOBALS['EXEC_TIME'];
    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $now) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    if ($this->tstamp_minAge && $indexedRow['tstamp'] + $this->tstamp_minAge >= $now) {
        // The minimum age was not exceeded
        return -2;
    }
    // From here on: no minimum age is set, or it has been exceeded, so mtime decides.
    if (!$mtime) {
        // No mtime was given, so the page is indexed.
        return 3;
    }
    if ($indexedRow['item_mtime'] != $mtime) {
        // mtime differs from the one stored in index_phash, so re-index.
        return 2;
    }
    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        // With a max age configured the tstamp is left untouched.
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1766
/**
 * Looks for an index_phash row with the current phash_grouping and content hash.
 *
 * The page only needs indexing when no row of the same "phash_grouping"
 * already carries identical content.
 *
 * @return mixed TRUE if the page needs to be indexed (no matching row, or index_phash not in use), otherwise the matching row (array with key "phash") to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $where = 'phash_grouping=' . (int)$this->hash['phash_grouping']
        . ' AND contentHash=' . (int)$this->content_md5h;
    $matchingRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
    // A falsy lookup result means "no match", which is reported as TRUE (= index it).
    return $matchingRow ?: true;
}
1784
/**
 * Checks whether index_phash already holds a row with the given
 * phash_grouping and content hash for an external document.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed (no matching row, or index_phash not in use)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $where = 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h;
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
    return $matches == 0;
}
1802
/**
 * Tells whether index_grlist contains at least one row whose phash_x equals
 * the given value (the "real" phash of the current content, not the linked-to
 * phash of the common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool FALSE when no such row exists or index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }
    $rowCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
    return $rowCount > 0;
}
1818
/**
 * Writes a grlist entry for the given phash and the current gr_list if
 * index_grlist does not contain one yet, and logs the insertion.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values differ when a page with user login turns out to contain the exact same content as another already indexed version of the page.
 * @return void
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $where = 'phash=' . (int)$phash
        . ' AND hash_gr_list=' . IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $existingEntries = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existingEntries != 0) {
        // An entry is already present, nothing to write.
        return;
    }
    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1837
/**
 * Sets tstamp of the index_phash row to the current EXEC_TIME and, when
 * $mtime is given, also stores it as item_mtime.
 *
 * Does nothing when the index_phash table is not in use.
 *
 * @param int $phash phash value
 * @param int $mtime If set, update the mtime field to this value.
 * @return void
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $values = array('tstamp' => $GLOBALS['EXEC_TIME']);
    if ($mtime) {
        $values['item_mtime'] = (int)$mtime;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $values);
}
1857
/**
 * Writes the configured freeIndexSetId ($this->conf['freeIndexSetId'])
 * into the index_phash row of the given phash.
 *
 * Does nothing when the index_phash table is not in use.
 *
 * @param int $phash phash value
 * @return void
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_phash',
        'phash=' . (int)$phash,
        array('freeIndexSetId' => (int)$this->conf['freeIndexSetId'])
    );
}
1873
/**
 * Stores the parsetime value in the index_phash row of the given phash.
 *
 * Does nothing when the index_phash table is not in use.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 * @return void
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_phash',
        'phash=' . (int)$phash,
        array('parsetime' => (int)$parsetime)
    );
}
1890
/**
 * Rewrites the root-line fields of all index_section rows that belong to
 * the current page ($this->conf['id']).
 *
 * Does nothing when the index_section table is not in use.
 *
 * @return void
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // getRootLineFields() fills the array by reference.
    $rootLineValues = array();
    $this->getRootLineFields($rootLineValues);
    $pageCondition = 'page_id=' . (int)$this->conf['id'];
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $pageCondition, $rootLineValues);
}
1904
/**
 * Adds the root-line fields to the given field array.
 *
 * rl0, rl1 and rl2 are always set from $this->conf['rootline_uids'][0..2].
 * Additional fields can be configured as "field name => root line level" pairs in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * A level that is missing from the root line yields 0 and does not raise an
 * undefined-index notice; the same applies when the additional-fields
 * configuration is absent.
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 */
public function getRootLineFields(array &$fieldArray)
{
    // Standard fields first, so they keep their position even if a configured field reuses a name.
    $levelByField = array('rl0' => 0, 'rl1' => 1, 'rl2' => 2);
    if (
        isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])
        && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])
    ) {
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
            $levelByField[$fieldName] = $rootLineLevel;
        }
    }
    foreach ($levelByField as $fieldName => $rootLineLevel) {
        // Short root lines lack the deeper levels; fall back to 0 as the integer cast of a missing value did.
        $fieldArray[$fieldName] = isset($this->conf['rootline_uids'][$rootLineLevel])
            ? (int)$this->conf['rootline_uids'][$rootLineLevel]
            : 0;
    }
}
1923
/**
 * Loads class.tx_crawler_lib.php from the "crawler" extension directory
 * (once per request, via require_once).
 *
 * @return void
 */
public function includeCrawlerClass()
{
    $crawlerExtensionPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler');
    require_once $crawlerExtensionPath . 'class.tx_crawler_lib.php';
}
1933
1934 /********************************
1935 *
1936 * SQL; Submitting words
1937 *
1938 *******************************/
/**
 * Inserts those words of the list into index_words that are not stored there yet.
 *
 * The list is keyed by base word; every entry provides at least "hash" (used
 * as wid) and "metaphone". Does nothing when index_words is not in use or the
 * list is empty.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc).
 * @return void
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }
    $wordIds = array();
    foreach ($wordListArray as $wordInfo) {
        $wordIds[] = (int)$wordInfo['hash'];
    }
    $widCondition = 'wid IN (' . implode(',', $wordIds) . ')';
    $storedCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $widCondition);
    $listCount = count($wordListArray);
    if ($storedCount === $listCount) {
        // Every word of the list is already stored.
        return;
    }
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $widCondition);
    $this->log_setTSlogMessage('Inserting words: ' . ($listCount - $storedCount), 1);
    // Drop the words that are already stored, so only new ones remain in the list.
    while ($storedWord = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        unset($wordListArray[$storedWord['baseword']]);
    }
    $GLOBALS['TYPO3_DB']->sql_free_result($res);
    foreach ($wordListArray as $baseword => $wordInfo) {
        // A duplicate-key error will occur here if a stored word was NOT removed above. However, as long as
        // the words are not longer than 60 chars (the baseword varchar is 60 characters) this is not a problem.
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
            'wid' => $wordInfo['hash'],
            'baseword' => $baseword,
            'metaphone' => $wordInfo['metaphone']
        ));
    }
}
1976
/**
 * Submits RELATIONS between words and phash.
 *
 * All existing index_rel rows of the phash are deleted first. Then one row
 * per word is inserted, skipping words whose wid is flagged as stop word in
 * index_words. When no rows remain (empty word list or only stop words), no
 * INSERT is issued at all.
 *
 * Does nothing when the index_rel table is not in use.
 *
 * @param array $wordList Word list array; each entry provides "hash", "count", "first" and "cmp"
 * @param int $phash phash value
 * @return void
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    // Result is keyed by wid, which allows a cheap isset() lookup per word below.
    $stopWords = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('wid', 'index_words', 'is_stopword != 0', '', '', '', 'wid');

    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . (int)$phash);
    $fields = array('phash', 'wid', 'count', 'first', 'freq', 'flags');
    $rows = array();
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        $rows[] = array(
            (int)$phash,
            (int)$val['hash'],
            (int)$val['count'],
            (int)$val['first'],
            // Relative frequency of the word within the document, mapped by freqMap().
            $this->freqMap($val['count'] / $this->wordcount),
            $val['cmp'] & $this->flagBitMask
        );
    }
    // A multi-row INSERT without rows cannot be turned into a valid statement, so skip it.
    if (!empty($rows)) {
        $GLOBALS['TYPO3_DB']->exec_INSERTmultipleRows('index_rel', $fields, $rows);
    }
}
2008
/**
 * Maps a frequency to the stored range and back.
 *
 * The map factor is $this->freqMax * 100 * $this->freqRange. Input values
 * up to and including 1 are multiplied by the factor and capped at
 * $this->freqRange; larger input values are divided by the factor instead
 * (the reverse direction).
 *
 * NOTE: the result is not cast, so it is only an integer if the arithmetic
 * happens to produce one.
 *
 * @param float $freq Frequency
 * @return int|float Mapped frequency
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;
    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        return $scaled > $this->freqRange ? $this->freqRange : $scaled;
    }
    return $freq / $mapFactor;
}
2027
2028 /********************************
2029 *
2030 * Hashing
2031 *
2032 *******************************/
/**
 * Computes the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" combined of id, type, language,
 * mount point and cHash parameters. "phash" additionally takes the gr_list
 * into account (a subdivision for page compositions that depend on login;
 * such pages are expected to be normally similar regardless of the login).
 *
 * @return void
 */
public function setT3Hashes()
{
    $groupingParts = array(
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    );
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($groupingParts));

    // Same parts plus the group list, appended last so the serialized form keeps its key order.
    $phashParts = $groupingParts;
    $phashParts['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($phashParts));
}
2054
/**
 * Computes the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the
 * file name together with $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = array())
{
    $groupingParts = array('file' => $file);
    $phashParts = array(
        'file' => $file,
        'subinfo' => $subinfo
    );
    return array(
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($groupingParts)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($phashParts))
    );
}
2076
2077 /*********************************
2078 *
2079 * Internal logging functions
2080 *
2081 *********************************/
/**
 * Forwards to $GLOBALS['TT']->push() if $GLOBALS['TT'] is an object;
 * otherwise does nothing.
 *
 * @param string $msg Title to set
 * @param string $key Key, passed through unchanged as second argument of push()
 * @return void
 */
public function log_push($msg, $key)
{
    $timeTracker = $GLOBALS['TT'];
    if (!is_object($timeTracker)) {
        return;
    }
    $timeTracker->push($msg, $key);
}
2095
/**
 * Forwards to $GLOBALS['TT']->pull() if $GLOBALS['TT'] is an object;
 * otherwise does nothing.
 *
 * @return void
 */
public function log_pull()
{
    $timeTracker = $GLOBALS['TT'];
    if (!is_object($timeTracker)) {
        return;
    }
    $timeTracker->pull();
}
2107
/**
 * Passes the message to $GLOBALS['TT']->setTSlogMessage() if $GLOBALS['TT']
 * is an object, and in every case appends it to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 * @return void
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $timeTracker = $GLOBALS['TT'];
    if (is_object($timeTracker)) {
        $timeTracker->setTSlogMessage($msg, $errorNum);
    }
    // The internal log is kept regardless of whether $GLOBALS['TT'] is available.
    $this->internal_log[] = $msg;
}
2122
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as part of the fulltext index.
 *
 * The comma-separated list is split with GeneralUtility::trimExplode(),
 * re-joined with ", " and wrapped in a single leading and trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    $spacedList = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
    return ' ' . $spacedList . ' ';
}
2136 }