[CLEANUP] Improve the @param/@return/@var PHPDoc
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
19
/**
 * Indexing class for TYPO3 frontend: a search indexer for TYPO3.
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 */
30 class Indexer {
31
/**
 * Messages explaining the outcome of the re-index check. indexTypo3PageContent() treats
 * positive keys as "index the document" and logs the other ones as the reason for skipping.
 *
 * @var array
 */
public $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * HTML code blocks to exclude from indexing (comma separated list of tag names)
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * Supported extensions for external files: external parser objects, keys are file
 * extension names. Values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = array();

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination
 * to support search in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Max age: if set, this tells a number of seconds that is the maximum age of an indexed
 * document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Min age: if set, this tells a minimum limit before a document can be indexed again.
 * This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = FALSE;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = FALSE;

/**
 * INTERNALS: empty content structure that splitHTMLContent() starts from
 *
 * @var array
 */
public $defaultContentArray = array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => ''
);

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = array();

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 */
public $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = array();

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = array();

/**
 * Content of TYPO3 page
 *
 * @var array
 */
public $contentParts = array();

/**
 * Hash over the content parts, calculated in indexTypo3PageContent()
 *
 * @var mixed
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = array();

/**
 * Content fetched by the most recent indexExternalUrl() call
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = array();

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * Set in init() from the "enableMetaphoneSearch" extension setting (TRUE if the setting is missing)
 *
 * @var bool
 */
public $enableMetaphoneSearch = FALSE;

/**
 * Set in init(): TRUE only if the index_words table is not used and metaphone search is enabled
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

// Objects:
/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Set in init() from the "flagBitMask" extension setting, limited to the range 0-255
 *
 * @var int
 */
public $flagBitMask;
138
/**
 * Frontend hook: checks whether the page rendered by the parent object (TSFE) may be indexed
 * and, if so, copies the relevant page data into $this->conf and starts the indexing.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 * @return void
 */
public function hook_indexContent(&$pObj) {
	// Indexer configuration from Extension Manager interface
	$extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// Crawler activation: the crawler must be loaded, a crawler session must be running
	// and re-indexing must be requested as processing instruction
	if (
		\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
	) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

	// Nothing to do (and nothing to log) unless indexing is enabled for the page
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page', '');
	if ($extensionSettings['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Root line uids, keyed like the rootline itself
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $rootlineKey => $rootlinePage) {
			$rootlineUids[$rootlineKey] = $rootlinePage['uid'];
		}

		$this->conf = array(
			// Information about the page for which the indexing takes place
			'id' => $pObj->id,
			'type' => $pObj->type,
			// sys_language UID of the language of the indexing
			'sys_language_uid' => $pObj->sys_language_uid,
			// MP variable, if any (Mount Points)
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,
			// cHash string and the array of additional parameters it was built from
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,
			// The creation date of the TYPO3 page
			'crdate' => $pObj->page['crdate'],
			// reg1 of the caching table. Not known what practical use this has.
			'page_cache_reg1' => $pObj->page_cache_reg1,
			'rootline_uids' => $rootlineUids,
			// Content string (HTML of TYPO3 page)
			'content' => $pObj->content,
			// Alternative title for indexing
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			// Character set of content (will be converted to utf-8 during indexing)
			'metaCharset' => $pObj->metaCharset,
			// Most recent modification time (seconds) of the content on the page.
			// Used to evaluate whether it should be re-indexed.
			'mtime' => isset($pObj->register['SYS_LASTCHANGED']) ? $pObj->register['SYS_LASTCHANGED'] : $pObj->page['SYS_LASTCHANGED'],
			// Whether to index external documents like PDF, DOC etc. (if possible)
			'index_externals' => $pObj->config['config']['index_externals'],
			// Length of description text (max 250, default 200)
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			// Falls back to TRUE when not configured
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,
			// Set to zero
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0
		);

		// Init and start indexing
		$this->init();
		$this->indexTypo3PageContent();
	}
	$this->log_pull();
}
228
229 /****************************
230 *
231 * Backend API
232 *
233 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 * @return void
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = FALSE) {
	// cHash string for the additional parameters; only calculated on request
	$cHash = '';
	if ($createCHash) {
		/* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
		$cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
		$cHash = $cacheHash->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
	}

	// Internal configuration describing the page for which the indexing takes place
	$this->conf = array(
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		// Group list (hardcoded for now...)
		'gr_list' => '0,-1',
		'cHash' => $cHash,
		// Array of the additional parameters
		'cHash_array' => $cHash_array,
		// Set to defaults
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',
		'rootline_uids' => $uidRL,
		// Whether to index external documents like PDF, DOC etc. (if possible)
		'index_externals' => 1,
		// Length of description text (max 250, default 200)
		'index_descrLgd' => 200,
		// Whether to index document keywords and description (if present)
		'index_metatags' => TRUE
	);

	$this->init();
}
287
/**
 * Stores the free-index uid and its set id in $this->conf.
 * Can be called right after backend_initIndexer()
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 * @return void
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0) {
	// Both values simply replace the defaults that backend_initIndexer() put into the configuration
	$this->conf['freeIndexUid'] = $freeIndexUid;
	$this->conf['freeIndexSetId'] = $freeIndexSetId;
}
299
/**
 * Indexing records as the content of a TYPO3 page: the given values are wrapped into a
 * minimal HTML document which is then indexed like a regular page.
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0) {
	// All values are escaped before they are placed into the fake document
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);

	// Most recent modification time (seconds) of the content
	$this->conf['mtime'] = $mtime;
	// The creation date of the TYPO3 content
	$this->conf['crdate'] = $crdate;
	// UID of the record, if applicable
	$this->conf['recordUid'] = $recordUid;
	// Fake HTML for parsing, used as content string (HTML of TYPO3 page)
	$this->conf['content'] = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedBody . '
</body>
</html>';
	// Character set of content (will be converted to utf-8 during indexing)
	$this->conf['metaCharset'] = $charset;
	// No alternative title for indexing
	$this->conf['indexedDocTitle'] = '';

	// Index content as if it was a TYPO3 page
	$this->indexTypo3PageContent();
}
342
343 /********************************
344 *
345 * Initialization
346 *
347 *******************************/
/**
 * Initializes the object: hashes, limits from the extension configuration, external
 * parsers, lexer, metaphone hook and charset converter.
 * $this->conf MUST be set with proper values prior to this call!!!
 *
 * @return void
 */
public function init() {
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
		if ($this->conf['cHash']) {
			// Add this so that URL's come out right...
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}

	// Setting phash / phash_grouping which identifies the indexed page based on some of these variables
	$this->setT3Hashes();

	// Indexer configuration from Extension Manager interface
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$settings = $this->indexerConfig;
	// minAge / maxAge are multiplied by 3600 before being stored
	$this->tstamp_minAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['minAge'] * 3600, 0);
	$this->tstamp_maxAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['maxAge'] * 3600, 0);
	$this->maxExternalFiles = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['maxExternalFiles'], 0, 1000, 5);
	$this->flagBitMask = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['flagBitMask'], 0, 255);

	// Workaround: If the extension configuration was not updated yet, the value is not existing
	// and metaphone search counts as enabled
	$this->enableMetaphoneSearch = !isset($settings['enableMetaphoneSearch']) || (bool)$settings['enableMetaphoneSearch'];
	$this->storeMetaphoneInfoAsWords = !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;

	// Initialize external document parsers.
	// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

	// Initialize lexer (class that deconstructs the text into words)
	$lexerReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
	$this->lexerObj = GeneralUtility::getUserObj($lexerReference);
	$this->lexerObj->debug = $settings['debugMode'];

	// Initialize metaphone hook.
	// Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
	$metaphoneReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'];
	if ($this->enableMetaphoneSearch && $metaphoneReference) {
		$this->metaphoneObj = GeneralUtility::getUserObj($metaphoneReference);
		$this->metaphoneObj->pObj = $this;
	}

	// Init charset class
	$this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
392
/**
 * Creates the external parser objects registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds.
 *
 * @return void
 * @access private
 * @see init()
 */
public function initializeExternalParsers() {
	$registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
	if (!is_array($registeredParsers)) {
		return;
	}
	foreach ($registeredParsers as $extension => $objectReference) {
		$parser = GeneralUtility::getUserObj($objectReference);
		// Registered before initialisation, exactly as long as the parser does not reject the extension
		$this->external_parsers[$extension] = $parser;
		$parser->pObj = $this;
		// Init parser and if it returns FALSE, unset its entry again
		if (!$parser->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
412
413 /********************************
414 *
415 * Indexing; TYPO3 pages (HTML content)
416 *
417 *******************************/
/**
 * Start indexing of the TYPO3 page: decides from mtime/timestamps, gr_list and content hash
 * whether the page has to be (re-)indexed and performs the indexing steps if so.
 *
 * @return void
 */
public function indexTypo3PageContent() {
	$phash = $this->hash['phash'];
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $phash);
	$grListIsSet = $this->is_grlist_set($phash);

	// Nothing to do when the timestamps do not demand it, the gr_list is registered and indexing is not forced
	if (!($check > 0) && $grListIsSet && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		return;
	}

	// Setting message
	if ($this->forceIndexing) {
		$reason = 'Forced';
	} elseif ($check > 0) {
		$reason = $this->reasons[$check];
	} else {
		$reason = 'Updates gr_list!';
	}
	$this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

	// Divide into title, keywords, description and body
	$this->log_push('Split content', '');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

	// Calculating a hash over what is to be the actual page content. Maybe this hash should not include
	// title, description and keywords? The bodytext is the primary concern. (on the other hand a changed
	// page-title would make no difference then, so dont!)
	$this->content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

	// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the
	// very same contentHash. If the contentHash is the same, then we can rest assured that this page is
	// already indexed and regardless of mtime and origContent we don't need to do anything more.
	// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the
	// page content is not changed anyway. fe_users logged in should always search with
	// hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has
	// been indexed with no user login on before hand. Else the page will be indexed by users until that
	// event. However that does not present a serious problem.
	$checkCHash = $this->checkContentHash();

	if (is_array($checkCHash) && $check !== 1) {
		// Same content is indexed already: only update the timestamp, set id, grlist and rootline
		$this->updateTstamp($phash, $this->conf['mtime']);
		$this->updateSetId($phash);
		// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->update_grlist($checkCHash['phash'], $phash);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

	$startTime = GeneralUtility::milliseconds();

	$this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
	$this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
	$this->log_pull();

	// Splitting words
	$this->log_push('Extract words from content', '');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

	// Analyse the indexed words
	$this->log_push('Analyse the extracted words', '');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

	// Submitting page (phash) record
	$this->log_push('Submitting page', '');
	$this->submitPage();
	$this->log_pull();

	// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words', '');
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $phash);
	}
	$this->log_pull();

	// Set parsetime
	$this->updateParsetime($phash, GeneralUtility::milliseconds() - $startTime);

	// Checking external files if configured for
	$this->log_push('Checking external files', '');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
493
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content) {
	$parts = $this->defaultContentArray;

	// Divide head from body: the body starts at the first "<body", the head is whatever precedes it.
	// NOTE(review): without a "<body" tag stristr() yields FALSE, the negative length below becomes 0
	// and the head part ends up empty as well - confirm that this is intended.
	$body = stristr($content, '<body');
	$headPart = substr($content, 0, -strlen($body));

	// Get title; if it contains a colon only the text after the first colon is used
	$this->embracingTags($headPart, 'TITLE', $parts['title'], $unusedRemainder, $unusedAttributes);
	$titleParts = explode(':', $parts['title'], 2);
	$parts['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

	// Get keywords and description metatags
	if ($this->conf['index_metatags']) {
		// Collect the attribute strings of all meta tags; each hit is cut out of $headPart
		$metaTags = array();
		$index = 0;
		while ($this->embracingTags($headPart, 'meta', $unusedContent, $headPart, $metaTags[$index])) {
			$index++;
		}
		// @todo The code below stops at first unset tag. Is that correct?
		foreach ($metaTags as $rawAttributes) {
			if ($rawAttributes === NULL) {
				break;
			}
			$attributes = GeneralUtility::get_tag_attributes($rawAttributes);
			if (stristr($attributes['name'], 'keywords')) {
				$parts['keywords'] .= ',' . $this->addSpacesToKeywordList($attributes['content']);
			}
			if (stristr($attributes['name'], 'description')) {
				$parts['description'] .= ',' . $attributes['content'];
			}
		}
	}

	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags
	$this->typoSearchTags($body);

	// Get rid of unwanted sections (ie. scripting and style stuff) in body
	foreach (explode(',', $this->excludeSections) as $tagName) {
		do {
			$sectionRemoved = $this->embracingTags($body, $tagName, $unusedContent, $body, $unusedAttributes);
		} while ($sectionRemoved);
	}

	// Remove tags, but first make sure we don't concatenate words by doing it
	$parts['body'] = trim(strip_tags(str_replace('<', ' <', $body)));
	$parts['keywords'] = trim($parts['keywords']);
	$parts['description'] = trim($parts['description']);

	return $parts;
}
545
/**
 * Extract the charset value from the HTML meta tag with http-equiv="Content-Type".
 *
 * @param string $content HTML content
 * @return string|NULL The charset value if found, otherwise NULL.
 */
public function getHTMLcharset($content) {
	$contentTypeMetaTag = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
	$charsetAssignment = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
	// The charset is only searched inside the matched meta tag, not in the whole document
	if (
		preg_match($contentTypeMetaTag, $content, $metaTagMatch)
		&& preg_match($charsetAssignment, $metaTagMatch[0], $charsetMatch)
	) {
		return $charsetMatch[1];
	}
	return NULL;
}
559
/**
 * Converts a HTML document to utf-8
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '') {
	// Without an explicit charset, fall back to the one declared in the document
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

	// Convert charset, unless there is none or the document is utf-8 already
	$needsConversion = $charset && $charset !== 'utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

	// Convert entities, assuming document is now UTF-8
	return $this->csObj->entities_to_utf8($content, TRUE);
}
579
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the start tag is found but the end tag is missing, the content is set to an empty string and
 * the "string after" is only the text following the start tag.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
	$startTag = '<' . $tagName;
	$endTag = '</' . $tagName . '>';

	// stristr used because we want a case-insensitive search for the tag
	$fromStartTag = stristr($string, $startTag);
	if (!$fromStartTag) {
		return FALSE;
	}

	// Split what follows the tag name at the first ">": attributes first, then the remaining text
	$pieces = explode('>', substr($fromStartTag, strlen($startTag)), 2);
	$paramList = $pieces[0];
	$afterStartTag = $pieces[1];

	$fromEndTag = stristr($afterStartTag, $endTag);
	if (!$fromEndTag) {
		$tagContent = '';
		$stringAfter = $afterStartTag;
		return TRUE;
	}

	// $fromStartTag is the tail of $string beginning at the start tag, so the length difference is the tag offset
	$stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
	$tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
	$stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
	return TRUE;
}
613
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body) {
	$chunks = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
	if (count($chunks) <= 1) {
		return FALSE;
	}

	$body = '';
	foreach ($chunks as $chunk) {
		// Text in front of "-->" is the marker type, the rest is the content following the marker
		$markerAndContent = explode('-->', $chunk, 2);
		switch (trim($markerAndContent[0])) {
			case 'begin':
				// Content after a begin marker is kept
				$body .= $markerAndContent[1];
				$prev = '';
				break;
			case 'end':
				// Content after an end marker is dropped; text remembered from a chunk without marker is kept
				$body .= $prev;
				break;
			default:
				$prev = $chunk;
		}
	}
	return TRUE;
}
640
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme are handed to indexExternalUrl() (if "indexExternalURLs" is enabled), links
 * to existing local files are either added to the crawler queue or indexed directly.
 *
 * @param string $content HTML content
 * @return void
 */
public function extractLinks($content) {
	// Get links
	$links = $this->extractHyperLinks($content);

	// Stays NULL unless indexing of external files is delegated to the crawler extension
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
	}

	foreach ($links as $linkInfo) {
		// localPath means: This file is sent by a download script. While the indexed URL has to point
		// to $linkInfo['href'], the absolute path to the file is specified here!
		$hasLocalPath = !empty($linkInfo['localPath']);
		// Decode entities
		$linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

		// Parse URL; parse_url() only returns the components that are present (or FALSE),
		// hence the empty() checks below
		$qParts = parse_url($linkSource);
		// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== FALSE) {
			parse_str($qParts['query'], $getP);
			$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);
		}

		if (!$hasLocalPath && !empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}
		// Local links carrying a query string are not treated as files
		if (!empty($qParts['query'])) {
			continue;
		}

		$linkSource = urldecode($linkSource);
		if (GeneralUtility::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		if (is_object($crawler)) {
			// Queue the file for the crawler; "alturl" is only passed for download-script links
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);
			$crawler->addQueueEntry_callBack(0, $params, '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerFilesHook', $this->conf['id']);
			$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
		} elseif ($hasLocalPath) {
			// Index local file under the URL of the download script, passing the file extension along
			$fileInfo = pathinfo($linkSource);
			$ext = isset($fileInfo['extension']) ? strtolower($fileInfo['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			// Index local file
			$this->indexRegularDocument($linkSource);
		}
	}
}
717
/**
 * Extracts all links to external documents from the HTML content string.
 * Anchors without href and anchors whose href starts with "#" are skipped.
 *
 * @param string $html HTML content
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html) {
	$htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
	$hyperLinksData = array();
	foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
		// Only the odd entries of the split result are inspected
		if ($index % 2 === 0) {
			continue;
		}
		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'a') {
			continue;
		}
		$href = $tagAttributes[0]['href'];
		if (!$href || $href[0] == '#') {
			continue;
		}
		$hyperLinksData[] = array(
			'tag' => $tagData,
			'href' => $href,
			'localPath' => $this->createLocalPath($href)
		);
	}
	return $hyperLinksData;
}
746
/**
 * Extracts the "base href" from content string.
 *
 * @param string $html Content to analyze
 * @return string The href of the first base tag that has one, or an empty value if not found
 */
public function extractBaseHref($html) {
	$htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
	$href = '';
	foreach ($htmlParser->splitTags('base', $html) as $index => $tagData) {
		// Only the odd entries of the split result are inspected
		if ($index % 2 === 0) {
			continue;
		}
		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'base') {
			continue;
		}
		$href = $tagAttributes[0]['href'];
		if ($href) {
			// The first base tag carrying a href wins
			return $href;
		}
	}
	return $href;
}
771
772 /******************************************
773 *
774 * Indexing; external URL
775 *
776 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The headers of the URL are requested first. Only when they announce a
 * "text/html" content type the document itself is fetched, written to a
 * temporary file and handed over to indexRegularDocument().
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl) {
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // HTTP header field names are case-insensitive, so normalise the keys before the lookup
    $urlHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
    $contentType = isset($urlHeaders['content-type']) ? $urlHeaders['content-type'] : '';
    if (!stristr($contentType, 'text/html')) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file, using "TRUE" for the second parameter to force
        // indexing of external URLs (mtime doesn't make sense for them)
        $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing threw an exception
        unlink($tmpFile);
    }
}
806
/**
 * Getting HTTP request headers of URL
 *
 * Every response line is split at its first colon. The part before the colon
 * becomes the array key, the remainder (stored unmodified, so it may start
 * with a space) the value. Lines without a colon, such as the status line,
 * are kept as keys with a NULL value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url) {
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return FALSE;
    }
    // Compile headers:
    $retVal = array();
    foreach (GeneralUtility::trimExplode(LF, $content, TRUE) as $line) {
        if (trim($line) === '') {
            break;
        }
        $lineParts = explode(':', $line, 2);
        $retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
    }
    return $retVal;
}
831
/**
 * Checks if the file is local
 *
 * The source path is handed to a fixed sequence of resolver methods of this
 * class; the first one yielding a non-empty path wins.
 *
 * @param string $sourcePath Link target to resolve
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
    // Resolver methods in the order they are tried
    $resolvers = array(
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    );
    $resolvedPath = '';
    foreach ($resolvers as $resolver) {
        $resolvedPath = $this->{$resolver}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
855
/**
 * Attempts to create a local file path from T3VARs. This is useful for
 * various download extensions that hide actual file name but still want the
 * file to be indexed.
 *
 * The lookup table is read from
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] and is keyed
 * by the short MD5 of the source path.
 *
 * @param string $sourcePath Link target to resolve
 * @return string Registered file path, or an empty string if none is registered or it is not a file
 */
protected function createLocalPathFromT3vars($sourcePath) {
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);
    return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
878
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * If the source path starts with the TYPO3_SITE_URL, that prefix is replaced
 * by PATH_site and the result is checked with isAllowedLocalFile().
 *
 * @param string $sourcePath Link target to resolve
 * @return string Local file path, or an empty string if the path does not match or is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
898
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, this function does nothing.
 *
 * A non-empty config.absRefPrefix found at the start of the source path is
 * replaced by PATH_site and the result is checked with isAllowedLocalFile().
 *
 * @param string $sourcePath Link target to resolve
 * @return string Local file path, or an empty string if the path does not match or is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
    if (!($GLOBALS['TSFE'] instanceof TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
921
/**
 * Attempts to create a local file path from the absolute URL without
 * schema.
 *
 * A source path starting with "/" is appended (without that slash) to
 * PATH_site and the result is checked with isAllowedLocalFile().
 *
 * @param string $sourcePath Link target to resolve
 * @return string Local file path, or an empty string if the path does not match or is not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
    if ($sourcePath[0] != '/') {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
940
/**
 * Attempts to create a local file path from the relative URL.
 *
 * A source path accepted by isRelativeURL() is appended to PATH_site and the
 * result is checked with isAllowedLocalFile().
 *
 * @param string $sourcePath Link target to resolve
 * @return string Local file path, or an empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
957
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path does not start
 * with "/". URLs that parse_url() cannot parse, or that have no path at all,
 * are reported as relative as well.
 *
 * @param string $url URL to check
 * @return bool TRUE if the URL has no scheme and no leading slash in its path
 */
static protected function isRelativeURL($url) {
    $urlParts = @parse_url($url);
    // parse_url() omits components that are not present (and returns FALSE
    // for seriously malformed URLs), so both parts need a default.
    $scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
    $path = isset($urlParts['path']) ? $urlParts['path'] : '';
    return $scheme == '' && ($path === '' || $path[0] != '/');
}
968
/**
 * Checks if the path points to the file inside the web site
 *
 * The path is passed through GeneralUtility::resolveBackPath() first; it is
 * accepted when the result starts with PATH_site and is an existing file.
 *
 * @param string $filePath Path to check
 * @return bool TRUE if the path is below PATH_site and is a file
 */
static protected function isAllowedLocalFile($filePath) {
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    $siteRootLength = strlen(PATH_site);
    $startsWithSiteRoot = substr($resolvedPath, 0, $siteRootLength) == PATH_site;
    $existsAsFile = is_file($resolvedPath);
    if (!$startsWithSiteRoot) {
        return FALSE;
    }
    return $existsAsFile;
}
981
982 /******************************************
983 *
984 * Indexing; external files (PDF, DOC, etc)
985 *
986 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is split into the sections reported by fileContentParts() and
 * each section is indexed on its own, unless the mtime/timestamp check or the
 * content hash shows that nothing changed. A section record is written for
 * every section, whether it was (re-)indexed or not.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it passes \TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 */
public function indexRegularDocument($file, $force = FALSE, $contentTmpFile = '', $altExtension = '') {
    // Init; $altExtension takes precedence over the extension found in $file
    $fI = pathinfo($file);
    $ext = $altExtension ?: strtolower(isset($fI['extension']) ? $fI['extension'] : '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    // empty() instead of a plain read: an extension without registered parser must not raise a notice
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = array();
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $subinfo = array('key' => $cPKey);
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the reversed argument order of implode() is no longer accepted by PHP 8.
                    $content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1092
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * The work is delegated to the parser object registered for the extension in
 * $this->external_parsers.
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array|NULL Standard content array (title, description, keywords, body keys) as returned by the parser, NULL if no parser object is registered
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
    // Consult relevant external document parser:
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return NULL;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1110
/**
 * Creates an array with pointers to divisions of document.
 *
 * The work is delegated to the parser object registered for the extension in
 * $this->external_parsers; without one the document is treated as a single
 * section with pointer 0.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile) {
    // Consult relevant external document parser:
    $parser = $this->external_parsers[$ext];
    if (!is_object($parser)) {
        return array(0);
    }
    return $parser->fileContentParts($ext, $absFile);
}
1126
/**
 * Splits non-HTML content (from external files for instance)
 *
 * Starts from a copy of $this->defaultContentArray and stores the whole
 * input under the "body" key.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
 * @see splitHTMLContent()
 */
public function splitRegularContent($content) {
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;
    return $parts;
}
1139
1140 /**********************************
1141 *
1142 * Analysing content, Extracting words
1143 *
1144 **********************************/
/**
 * Convert character set and HTML entities in the value of input content array keys
 *
 * Empty values are left untouched. The array is modified in place.
 *
 * @param array $contentArr Standard content array, passed by reference
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 */
public function charsetEntity2utf8(&$contentArr, $charset) {
    // The charset conversion is only needed for non-utf-8 input
    $needsCharsetConversion = $charset !== 'utf-8';
    foreach (array_keys($contentArr) as $key) {
        if ((string)$contentArr[$key] === '') {
            continue;
        }
        if ($needsCharsetConversion) {
            $contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key], TRUE);
    }
}
1164
/**
 * Processing words in the array from split*Content -functions
 *
 * Every entry is run through the lexer's split2Words(); duplicates are then
 * removed from the "title", "keywords" and "description" entries.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 */
public function processWordsInArrays($contentArr) {
    // split all parts to words
    foreach (array_keys($contentArr) as $key) {
        $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (array('title', 'keywords', 'description') as $headerKey) {
        $contentArr[$headerKey] = array_unique($contentArr[$headerKey]);
    }
    return $contentArr;
}
1183
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length comes from $this->conf['index_descrLgd'], passed through
 * MathUtility::forceIntegerInRange() with the arguments 0, 255 and 200. A
 * resulting length of 0 disables the description.
 *
 * @param array $contentArr Content array, the "body" key is used
 * @return string Description string, empty if the description is disabled
 */
public function bodyDescription($contentArr) {
    // Always return a string, also when no description is to be generated
    $bodyDescription = '';
    // Setting description
    $maxL = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }
    return $bodyDescription;
}
1200
/**
 * Analyzes content to use for indexing,
 *
 * The header fields (title, keywords, description) are processed by
 * analyzeHeaderinfo(), the body by analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array: word => information as collected by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content) {
    $indexArr = array();
    // Header field => bit offset handed to analyzeHeaderinfo()
    $headerOffsets = array(
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    );
    foreach ($headerOffsets as $headerKey => $offset) {
        $this->analyzeHeaderinfo($indexArr, $content, $headerKey, $offset);
    }
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1216
/**
 * Calculates relevant information for headercontent
 *
 * For every word the entry in the index array gets its "hash" and
 * "metaphone" (on first occurrence), the bit 2^$offset set in "cmp" and its
 * "count" increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type (bit position set in "cmp")
 * @return void
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // The flags may not exist yet for this word, so start from 0 then.
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
        // Increase number of occurrences
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1248
/**
 * Calculates relevant information for bodycontent
 *
 * For every word the entry in the index array gets its "first" position,
 * "hash" and "metaphone" (only if the word has no entry yet) and its "count"
 * increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 */
public function analyzeBody(&$retArr, $content) {
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences; the counter may not exist yet for this word
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1278
/**
 * Creating metaphone based hash from input word
 *
 * Uses $this->metaphoneObj if it is an object, PHP's native metaphone()
 * otherwise.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 */
public function metaphone($word, $returnRawMetaphoneValue = FALSE) {
    // Use native PHP function instead of advanced doubleMetaphone class if no object is available
    $metaphoneRawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $metaphoneRawValue;
    }
    if ($metaphoneRawValue === '') {
        return 0;
    }
    // Create hash and return integer
    return \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($metaphoneRawValue);
}
1303
1304 /********************************
1305 *
1306 * SQL; TYPO3 Pages
1307 *
1308 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Old records for the current phash are removed first. Then the index_phash
 * row, the section and gr_list records, the index_fulltext row and, in debug
 * mode, the index_debug row are written. Each table is only written if
 * IndexedSearchUtility::isTableUsed() reports it as used.
 *
 * @return void
 */
public function submitPage() {
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);

    // PROCESSING index_phash
    $phashRow = array(
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // Item type 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
    }

    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);

    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);

    // PROCESSING index_fulltext
    $fulltextRow = array(
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    );
    // A configured fullTextDataLength above 0 limits the stored text
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // Content and body are shortened to 1000 bytes for the debug record
    $debugInfo = array(
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
        'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRow = array(
        'phash' => $this->hash['phash'],
        'debuginfo' => serialize($debugInfo)
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
    }
}
1382
/**
 * Stores gr_list in the database.
 *
 * The record holds the gr_list from $this->conf together with its integer
 * hash. Nothing is written if the index_grlist table is not in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x) {
    $groupList = $this->conf['gr_list'];
    // Setting the gr_list record
    $grlistRow = array(
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grlistRow);
    }
}
1403
/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is completed by getRootLineFields() before it is inserted.
 * Nothing is written if the index_section table is not in use.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 * @return void
 */
public function submit_section($hash, $hash_t3) {
    $sectionRow = array(
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    );
    // Lets getRootLineFields() work on the row before it is stored
    $this->getRootLineFields($sectionRow);
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
    }
}
1423
/**
 * Removes records for the indexed page, $phash
 *
 * Tables that are not in use (IndexedSearchUtility::isTableUsed()) are skipped.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedPages($phash) {
    // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
    $pageTables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
    $phashCondition = 'phash=' . (int)$phash;
    foreach ($pageTables as $table) {
        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($table)) {
            $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $phashCondition);
        }
    }
    // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . (int)$phash);
    }
}
1443
1444 /********************************
1445 *
1446 * SQL; External media
1447 *
1448 *******************************/
/**
 * Updates db with information about the file
 *
 * Old records for the file's phash are removed first. Then the index_phash
 * row, the index_fulltext row and, in debug mode, the index_debug row are
 * written. Each table is only written if IndexedSearchUtility::isTableUsed()
 * reports it as used.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {
    // Find item Type: the parser's mapping for the extension, falling back to the extension itself
    $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext] ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename: a file name with a scheme is stored as external URL
    $fileParts = parse_url($file);

    // PROCESSING index_phash
    $phashRow = array(
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        // The file's base name is used if the content has no title
        'item_title' => trim($contentParts['title']) ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $fileParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
    }

    // PROCESSING index_fulltext
    $fulltextRow = array(
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    );
    // A configured fullTextDataLength above 0 limits the stored text
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // The body is shortened to 1000 bytes for the debug record
    $debugInfo = array(
        'cHashParams' => $subinfo,
        'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRow = array(
        'phash' => $hash['phash'],
        'debuginfo' => serialize($debugInfo)
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
    }
}
1524
/**
 * Stores file gr_list for a file IF it does not exist already
 *
 * A record counts as existing if one is found for the phash with either the
 * hash of $this->defaultGrList or the hash of the current gr_list. Nothing
 * happens if the index_grlist table is not in use.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_grlist($hash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $defaultGroupsHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList);
    $currentGroupsHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $where = 'phash=' . (int)$hash . ' AND (hash_gr_list=' . $defaultGroupsHash . ' OR hash_gr_list=' . $currentGroupsHash . ')';
    $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($count == 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1540
/**
 * Stores file section for a file IF it does not exist
 *
 * A section counts as existing if a record is found for the phash on the
 * current page ($this->conf['id']). A new record is stored with the current
 * page's phash as phash_t3. Nothing happens if the index_section table is not
 * in use.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_section($hash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // Testing if there is already a section
    $where = 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id'];
    $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
    if ($count == 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1556
/**
 * Removes records for the indexed file, $phash
 *
 * The index_section table is not touched here. Tables that are not in use
 * (IndexedSearchUtility::isTableUsed()) are skipped.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedFiles($phash) {
    // Removing old registrations for tables.
    $fileTables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
    $phashCondition = 'phash=' . (int)$phash;
    foreach ($fileTables as $table) {
        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($table)) {
            $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $phashCondition);
        }
    }
}
1572
1573 /********************************
1574 *
1575 * SQL Helper functions
1576 *
1577 *******************************/
/**
 * Checks the mtime / tstamp of the currently indexed page/file (based on phash)
 * and decides whether it has to be (re-)indexed.
 *
 * Result codes (see $this->reasons); negative means "do not index", positive means "index":
 *  -2: The minimum age was not exceeded.
 *  -1: mtime matched the indexed document, nothing to do.
 *   1: The configured max-age was exceeded.
 *   2: The minimum age was exceeded and the given mtime differs from the indexed one.
 *   3: The minimum age was exceeded but no mtime was given.
 *   4: No index_phash record found (or the table is not in use).
 *
 * When the result is -1 and no max-age is configured, the tstamp of the
 * index_phash record is refreshed through updateTstamp().
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Result code as listed above
 */
public function checkMtimeTstamp($mtime, $phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }
    $indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
    if (!$indexedRow) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }
    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    if ($this->tstamp_minAge && !($indexedRow['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'])) {
        // A minimum age is set and it was not exceeded yet.
        return -2;
    }
    // From here on minAge is either not set or exceeded, so mtime decides.
    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }
    if ($indexedRow['item_mtime'] != $mtime) {
        // mtime differs from the index_phash mtime, so it's about time to re-index.
        return 2;
    }
    // mtime matched the document, so no changes detected and no content updated.
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1633
/**
 * Checks whether index_phash already holds a record with the current
 * phash_grouping ($this->hash['phash_grouping']) and content hash ($this->content_md5h).
 *
 * @return bool|array TRUE if the page needs to be indexed (no matching record, or index_phash is not in use), otherwise the matching row (an array with the key "phash") to which the grlist record should be related
 */
public function checkContentHash() {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return TRUE;
    }
    // The page is only indexed if its content differs from the same "phash_grouping" page.
    $where = 'phash_grouping=' . (int)$this->hash['phash_grouping'] . ' AND contentHash=' . (int)$this->content_md5h;
    $matchingRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
    return $matchingRow ? $matchingRow : TRUE;
}
1650
/**
 * Checks the content hash of an external document against index_phash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed (no record with this grouping and content hash was counted, or index_phash is not in use)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return TRUE;
    }
    $where = 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h;
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
    return $matches == 0;
}
1667
/**
 * Tells whether index_grlist contains a record whose phash_x equals the given
 * value (the "real" phash of the current content, not the linked-to phash of
 * the common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if at least one record was counted; FALSE otherwise or if index_grlist is not in use
 */
public function is_grlist_set($phash_x) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return FALSE;
    }
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
    return $matches > 0;
}
1682
/**
 * Checks whether a grlist entry exists for $phash under the currently
 * configured gr_list and, if not, writes one and logs the insertion.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; this is the whole reason for the grlist table in fact...
 * @return void
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $grListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', 'phash=' . (int)$phash . ' AND hash_gr_list=' . $grListHash);
    if ($existing == 0) {
        $this->submit_grlist($phash, $phash_x);
        $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
    }
}
1700
/**
 * Sets the tstamp of an index_phash row to the current execution time and
 * optionally updates its item_mtime.
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-empty), item_mtime is updated to this value as well.
 * @return void
 */
public function updateTstamp($phash, $mtime = 0) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $fields = array('tstamp' => $GLOBALS['EXEC_TIME']);
    if ($mtime) {
        $fields['item_mtime'] = (int)$mtime;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $fields);
}
1719
/**
 * Writes the configured free index set id ($this->conf['freeIndexSetId'])
 * into the index_phash record of $phash.
 *
 * @param int $phash phash value
 * @return void
 */
public function updateSetId($phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_phash',
        'phash=' . (int)$phash,
        array('freeIndexSetId' => (int)$this->conf['freeIndexSetId'])
    );
}
1734
/**
 * Stores the parsetime in the index_phash record of $phash.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set (cast to integer before writing).
 * @return void
 */
public function updateParsetime($phash, $parsetime) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_phash',
        'phash=' . (int)$phash,
        array('parsetime' => (int)$parsetime)
    );
}
1750
/**
 * Rewrites the rootline fields of all index_section records belonging to the
 * current page ($this->conf['id']).
 *
 * @return void
 * @see getRootLineFields()
 */
public function updateRootline() {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // getRootLineFields() fills the array by reference.
    $rootLineFields = array();
    $this->getRootLineFields($rootLineFields);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int)$this->conf['id'], $rootLineFields);
}
1763
/**
 * Adds the root-line field values to $fieldArray.
 * rl0, rl1 and rl2 are always set from rootline levels 0, 1 and 2. Further
 * fields can be configured as fieldName => rootLineLevel pairs in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 */
public function getRootLineFields(array &$fieldArray) {
    // Standard fields, in the order rl0, rl1, rl2.
    foreach (array('rl0' => 0, 'rl1' => 1, 'rl2' => 2) as $standardField => $level) {
        $fieldArray[$standardField] = (int)$this->conf['rootline_uids'][$level];
    }
    if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
        return;
    }
    // Additionally configured fields.
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)$this->conf['rootline_uids'][$rootLineLevel];
    }
}
1781
1782 /**
1783 * Removes any indexed pages with userlogins which has the same contentHash
1784 * NOT USED anywhere inside this class!
1785 *
1786 * @return void
1787 */
1788 public function removeLoginpagesWithContentHash() {
// Both tables are joined in the query below, so both have to be in use.
1789 if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash') && \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
// Select the phash of every index_phash row (A) joined with an index_grlist row (B) that has the
// current phash_grouping and content hash, but a group-list hash different from the default group list's.
1790 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', '
1791 A.phash=B.phash
1792 AND A.phash_grouping=' . (int)$this->hash['phash_grouping'] . '
1793 AND B.hash_gr_list<>' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList) . '
1794 AND A.contentHash=' . (int)$this->content_md5h);
// Log and remove each hit; the loop is skipped entirely when $res is falsy.
1795 while ($res && FALSE !== ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
1796 $this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
1797 $this->removeOldIndexedPages($row['phash']);
1798 }
// NOTE(review): the result is freed unconditionally, i.e. also when $res is falsy - confirm sql_free_result() tolerates that.
1799 $GLOBALS['TYPO3_DB']->sql_free_result($res);
1800 }
1801 }
1802
/**
 * Includes the crawler library class (class.tx_crawler_lib.php) from the
 * "crawler" extension path.
 *
 * @return void
 */
public function includeCrawlerClass() {
    $crawlerPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler');
    GeneralUtility::requireOnce($crawlerPath . 'class.tx_crawler_lib.php');
}
1811
1812 /********************************
1813 *
1814 * SQL; Submitting words
1815 *
1816 *******************************/
/**
 * Adds words to index_words that are not stored there yet.
 *
 * @param array $wordListArray Word list array, keyed by the base word; each entry provides at least "hash" (the word id) and "metaphone".
 * @return void
 */
public function checkWordList($wordListArray) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
        return;
    }
    if (!count($wordListArray)) {
        return;
    }
    // Collect the word ids of all words in the list.
    $wordIds = array();
    foreach ($wordListArray as $wordData) {
        $wordIds[] = (int)$wordData['hash'];
    }
    $where = 'wid IN (' . implode(',', $wordIds) . ')';
    $knownCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
    if ($knownCount == count($wordListArray)) {
        // As many rows as words in the list were counted; nothing to insert.
        return;
    }
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
    $this->log_setTSlogMessage('Inserting words: ' . (count($wordListArray) - $knownCount), 1);
    // Drop the words that are already stored, so only new ones remain.
    while (FALSE != ($knownWord = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
        unset($wordListArray[$knownWord['baseword']]);
    }
    $GLOBALS['TYPO3_DB']->sql_free_result($res);
    foreach ($wordListArray as $baseword => $wordData) {
        // A duplicate-key error will occur here if a word was NOT removed by the unset() above.
        // However, as long as the words are not longer than 60 chars (the baseword varchar
        // is 60 characters...) this is not a problem.
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
            'wid' => $wordData['hash'],
            'baseword' => $baseword,
            'metaphone' => $wordData['metaphone']
        ));
    }
}
1852
/**
 * Submits the RELATIONS between words and a phash: all existing index_rel
 * rows of the phash are deleted and one row per word is inserted again,
 * leaving out words flagged as stop words in index_words.
 *
 * @param array $wordList Word list array; each entry provides "hash", "count", "first" and "cmp".
 * @param int $phash phash value
 * @return void
 */
public function submitWords($wordList, $phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    // Stop words, fetched with 'wid' as the index field so isset() can be used for the lookup.
    $stopWords = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('wid', 'index_words', 'is_stopword != 0', '', '', '', 'wid');

    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . (int)$phash);

    $relationRows = array();
    foreach ($wordList as $wordData) {
        if (!isset($stopWords[$wordData['hash']])) {
            // Value order matches the field list passed to exec_INSERTmultipleRows() below.
            $relationRows[] = array(
                (int)$phash,
                (int)$wordData['hash'],
                (int)$wordData['count'],
                (int)$wordData['first'],
                $this->freqMap($wordData['count'] / $this->wordcount),
                $wordData['cmp'] & $this->flagBitMask
            );
        }
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTmultipleRows(
        'index_rel',
        array('phash', 'wid', 'count', 'first', 'freq', 'flags'),
        $relationRows
    );
}
1883
/**
 * Maps a frequency given as a real number in [0;1] to a value in
 * [0;$this->freqRange], with anything above $this->freqMax mapped to the
 * full range, and back: values above 1 are divided by the same factor instead.
 *
 * @param float $freq Frequency
 * @return int|float Frequency in range. The value is not cast to an integer here.
 */
public function freqMap($freq) {
    $mapFactor = $this->freqMax * 100 * $this->freqRange;
    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        if ($scaled > $this->freqRange) {
            // Cap at the upper end of the range.
            return $this->freqRange;
        }
        return $scaled;
    }
    // Reverse direction for values above 1.
    return $freq / $mapFactor;
}
1901
1902 /********************************
1903 *
1904 * Hashing
1905 *
1906 *******************************/
/**
 * Calculates the search hashes for a TYPO3 page and stores them in
 * $this->hash['phash_grouping'] and $this->hash['phash'].
 *
 * @return void
 */
public function setT3Hashes() {
    // Values identifying a "page": id, type, language, mount point and cHash parameters.
    $identity = array(
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    );
    $groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
    $this->hash['phash_grouping'] = $groupingHash;
    // The plain phash additionally takes gr_list into account (subdivision where special page
    // composition based on login matters; such pages are expected to be similar regardless of the login).
    $identity['gr_list'] = (string)$this->conf['gr_list'];
    $plainHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
    $this->hash['phash'] = $plainHash;
}
1927
/**
 * Calculates the search hashes for an external file.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance, PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" (based on the file only) and "phash" (file plus subinfo) inside.
 */
public function setExtHashes($file, $subinfo = array()) {
    $identity = array('file' => $file);
    // Grouping hash: file only.
    $groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
    // Plain hash: file plus subinfo.
    $identity['subinfo'] = $subinfo;
    $plainHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
    return array(
        'phash_grouping' => $groupingHash,
        'phash' => $plainHash
    );
}
1948
1949 /*********************************
1950 *
1951 * Internal logging functions
1952 *
1953 *********************************/
/**
 * Push function wrapper for TT logging: forwards to $GLOBALS['TT']->push()
 * if $GLOBALS['TT'] is an object and does nothing otherwise.
 *
 * @param string $msg Title to set
 * @param string $key Key, passed through as second argument of push()
 * @return void
 */
public function log_push($msg, $key) {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->push($msg, $key);
}
1966
/**
 * Pull function wrapper for TT logging: forwards to $GLOBALS['TT']->pull()
 * if $GLOBALS['TT'] is an object and does nothing otherwise.
 *
 * @return void
 */
public function log_pull() {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->pull();
}
1977
/**
 * Set log message function wrapper for TT logging. The message is forwarded
 * to $GLOBALS['TT'] if that is an object, and it is always appended to
 * $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 * @return void
 */
public function log_setTSlogMessage($msg, $errorNum = 0) {
    $timeTracker = $GLOBALS['TT'];
    if (is_object($timeTracker)) {
        $timeTracker->setTSlogMessage($msg, $errorNum);
    }
    $this->internal_log[] = $msg;
}
1991
1992 /**************************
1993 *
1994 * \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController hooks:
1995 *
1996 **************************/
/**
 * Makes sure that keywords are space-separated. This is important for their
 * proper display as part of the fulltext index. The result is the keywords
 * joined by ", " and wrapped in one leading and one trailing space.
 *
 * @param string $keywordList Comma-separated list of keywords
 * @return string The re-joined keyword list
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList) {
    $keywords = GeneralUtility::trimExplode(',', $keywordList);
    $spacedList = implode(', ', $keywords);
    return ' ' . $spacedList . ' ';
}
2009
2010 }