9891cd3ea90cc8b6a46f9ceefa7cc25994639c3b
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18
/**
 * Indexing class for TYPO3 frontend: this class is a search indexer for TYPO3.
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 */
29 class Indexer {
30
/**
 * Messages explaining the status code that decides whether a document is
 * (re-)indexed. Positive codes lead to indexing, negative codes to skipping;
 * the text is written to the log by indexTypo3PageContent().
 *
 * @var array
 */
public $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * HTML code blocks to exclude from indexing (comma separated list of tag names).
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * Supported extensions for external files: external parser objects, keys are
 * file extension names. Values are objects with certain methods.
 * Filled by initializeExternalParsers().
 *
 * @var array
 */
public $external_parsers = array();

/**
 * Fe-group list (pages might be indexed separately for each usergroup
 * combination to support search in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

// Min/Max times:
/**
 * If set, this tells a number of seconds that is the maximum age of an indexed
 * document. Regardless of mtime the document will be re-indexed if this limit
 * is exceeded. Set by init() from the "maxAge" extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit before a document can be indexed again.
 * This is regardless of mtime. Set by init() from the "minAge" extension
 * setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index. Set by init() from the
 * "maxExternalFiles" extension setting.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = FALSE;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = FALSE;

// INTERNALS:
/**
 * Template of the array returned by splitHTMLContent(); every part starts out empty.
 *
 * @var array
 */
public $defaultContentArray = array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => ''
);

/**
 * NOTE(review): presumably a running word counter - not written anywhere in the visible part of the class.
 *
 * @var int
 */
public $wordcount = 0;

/**
 * NOTE(review): presumably counts indexed external files against $maxExternalFiles - confirm where it is incremented.
 *
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = array();

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 */
public $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = array();

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = array();

/**
 * Content of TYPO3 page, split into parts by splitHTMLContent()
 *
 * @var array
 */
public $contentParts = array();

/**
 * Hash calculated over the imploded $contentParts in indexTypo3PageContent()
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = array();

/**
 * Content fetched by the most recent indexExternalUrl() call
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array; copied from $this->conf['cHash_array'] in init()
 *
 * @var array
 */
public $cHashParams = array();

/**
 * NOTE(review): presumably the value range used when weighting word frequency - confirm against indexAnalyze().
 *
 * @var int
 */
public $freqRange = 32000;

/**
 * NOTE(review): presumably the upper bound for the relative word frequency - confirm against indexAnalyze().
 *
 * @var float
 */
public $freqMax = 0.1;

/**
 * Whether metaphone search is enabled; set by init() from the
 * "enableMetaphoneSearch" extension setting (TRUE if the setting is missing).
 *
 * @var bool
 */
public $enableMetaphoneSearch = FALSE;

/**
 * Set by init(): FALSE if the "index_words" table is used, otherwise the
 * value of $enableMetaphoneSearch.
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * NOTE(review): presumably collects metaphone data when it is stored as words - not written in the visible part of the class.
 *
 * @var string
 */
public $metaphoneContent = '';

// Objects:
/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Set by init() from the "flagBitMask" extension setting, forced into the range 0-255.
 *
 * @var int
 */
public $flagBitMask;
137
/**
 * Frontend hook (called with the TSFE object): decides whether the page that was
 * just rendered has to be indexed and, if so, copies all required page data into
 * $this->conf, initializes the indexer and indexes the page content.
 *
 * @param object $pObj Parent Object (frontend TSFE object), passed by reference
 * @return void
 */
public function hook_indexContent(&$pObj) {
	// Indexer configuration from Extension Manager interface:
	$extensionConfiguration = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	// Crawler activation requires that the crawler is loaded, a crawler session is running
	// and re-indexing was requested as processing instruction:
	$reindexRequestedByCrawler = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($reindexRequestedByCrawler) {
		// Simple log message for the crawler, then raise both internal flags:
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}
	// Indexing has to be enabled for the page at all (config.index_enable):
	if (!$pObj->config['config']['index_enable']) {
		return;
	}
	$this->log_push('Index page', '');
	if ($extensionConfiguration['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Root line uids
		$rootLineUids = array();
		foreach ($pObj->config['rootLine'] as $rootLineKey => $rootLinePage) {
			$rootLineUids[$rootLineKey] = $rootLinePage['uid'];
		}
		// Setting up internal configuration from the TSFE object:
		$this->conf = array(
			// Page id
			'id' => $pObj->id,
			// Page type
			'type' => $pObj->type,
			// sys_language UID of the language of the indexing.
			'sys_language_uid' => $pObj->sys_language_uid,
			// MP variable, if any (Mount Points)
			'MP' => $pObj->MP,
			// Group list
			'gr_list' => $pObj->gr_list,
			// cHash string for additional parameters
			'cHash' => $pObj->cHash,
			// Array of the additional parameters
			'cHash_array' => $pObj->cHash_array,
			// The creation date of the TYPO3 page
			'crdate' => $pObj->page['crdate'],
			// reg1 of the caching table. Not known what practical use this has.
			'page_cache_reg1' => $pObj->page_cache_reg1,
			'rootline_uids' => $rootLineUids,
			// Content string (HTML of TYPO3 page)
			'content' => $pObj->content,
			// Alternative title for indexing
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			// Character set of content (will be converted to utf-8 during indexing)
			'metaCharset' => $pObj->metaCharset,
			// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
			'mtime' => isset($pObj->register['SYS_LASTCHANGED']) ? $pObj->register['SYS_LASTCHANGED'] : $pObj->page['SYS_LASTCHANGED'],
			// Whether to index external documents like PDF, DOC etc. (if possible)
			'index_externals' => $pObj->config['config']['index_externals'],
			// Length of description text (max 250, default 200)
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			// Whether to index document keywords and description; enabled unless configured otherwise
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,
			// Set to zero:
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0
		);
		// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}
	$this->log_pull();
}
227
228 /****************************
229 *
230 * Backend API
231 *
232 ****************************/
/**
 * Backend entry point: sets up $this->conf for the "combined ID" of the page (phash)
 * being indexed (or for which external media is attached) and initializes the indexer.
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 * @return void
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = FALSE) {
	// Information about the page for which the indexing takes place:
	$this->conf = array(
		// Page id (int)
		'id' => $id,
		// Page type (int)
		'type' => $type,
		// sys_language UID of the language of the indexing (int)
		'sys_language_uid' => $sys_language_uid,
		// MP variable, if any (Mount Points) (string)
		'MP' => $MP,
		// Group list (hardcoded for now...)
		'gr_list' => '0,-1'
	);
	// cHash string for additional parameters; only calculated on request:
	$cHash = '';
	if ($createCHash) {
		/* @var $cacheHashCalculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
		$cacheHashCalculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
		$cHash = $cacheHashCalculator->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
	}
	$this->conf['cHash'] = $cHash;
	// None of the keys below exist yet, so the union simply appends them in this order:
	$this->conf += array(
		// Array of the additional parameters
		'cHash_array' => $cHash_array,
		// Set to defaults
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',
		// Root line uids
		'rootline_uids' => $uidRL,
		// Whether to index external documents like PDF, DOC etc. (if possible)
		'index_externals' => 1,
		// Length of description text (max 250, default 200)
		'index_descrLgd' => 200,
		// Whether to index document keywords and description (if present)
		'index_metatags' => TRUE
	);
	// Init and start indexing:
	$this->init();
}
286
/**
 * Registers the free-index uid and set id in $this->conf.
 * Can be called right after backend_initIndexer()
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 * @return void
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0) {
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId
	);
	foreach ($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
298
/**
 * Indexes record data as if it was the content of a TYPO3 page: the values are
 * wrapped in a minimal HTML document which is then run through the page indexer.
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0) {
	// All values end up in HTML context, so escape them first:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedContent = htmlspecialchars($content);
	// Most recent modification time (seconds) of the content
	$this->conf['mtime'] = $mtime;
	// The creation date of the TYPO3 content
	$this->conf['crdate'] = $crdate;
	// UID of the record, if applicable
	$this->conf['recordUid'] = $recordUid;
	// Construct fake HTML for parsing; this is the content string (HTML of TYPO3 page):
	$this->conf['content'] = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedContent . '
</body>
</html>';
	// Character set of content (will be converted to utf-8 during indexing)
	$this->conf['metaCharset'] = $charset;
	// Alternative title for indexing (none here)
	$this->conf['indexedDocTitle'] = '';
	// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}
341
342 /********************************
343 *
344 * Initialization
345 *
346 *******************************/
/**
 * Initializes the object: hashes, settings from the extension configuration,
 * external parsers, lexer, metaphone hook and charset converter.
 * $this->conf MUST be set with proper values prior to this call!!!
 *
 * @return void
 */
public function init() {
	// Initializing the cHash parameters:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		// Add the cHash itself so that URL's come out right...
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}
	// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
	$this->setT3Hashes();
	// Indexer configuration from Extension Manager interface:
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	// minAge / maxAge are multiplied by 3600 before being stored:
	$this->tstamp_minAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['minAge'] * 3600, 0);
	$this->tstamp_maxAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['maxAge'] * 3600, 0);
	$this->maxExternalFiles = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
	$this->flagBitMask = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);
	// Workaround: If the extension configuration was not updated yet, the value is not existing (and metaphone search stays enabled)
	$this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || (bool)$this->indexerConfig['enableMetaphoneSearch'];
	// Metaphone info is only stored as words when the index_words table is not in use:
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
		$this->storeMetaphoneInfoAsWords = FALSE;
	} else {
		$this->storeMetaphoneInfoAsWords = (bool)$this->enableMetaphoneSearch;
	}
	// Initialize external document parsers:
	// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}
	// Initialize lexer (class that deconstructs the text into words):
	$lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
	$this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];
	// Initialize metaphone hook:
	// Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
	if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}
	// Init charset class:
	$this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
391
/**
 * Creates the parser objects registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds in $this->external_parsers.
 *
 * @return void
 * @access private
 * @see init()
 */
public function initializeExternalParsers() {
	$registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
	if (!is_array($registeredParsers)) {
		return;
	}
	foreach ($registeredParsers as $fileExtension => $objectReference) {
		$parser = GeneralUtility::getUserObj($objectReference);
		// Register the parser before initializing it, so it is reachable through pObj meanwhile:
		$this->external_parsers[$fileExtension] = $parser;
		$parser->pObj = $this;
		// Init parser and if it returns FALSE, unset its entry again:
		if (!$parser->initParser($fileExtension)) {
			unset($this->external_parsers[$fileExtension]);
		}
	}
}
411
412 /********************************
413 *
414 * Indexing; TYPO3 pages (HTML content)
415 *
416 *******************************/
/**
 * Start indexing of the TYPO3 page held in $this->conf.
 * Depending on mtime/timestamp checks, the gr_list registration and the content
 * hash the page is either indexed completely, only "touched" (timestamp, set id,
 * gr_list and rootline updated) or left alone.
 *
 * @return void
 */
public function indexTypo3PageContent() {
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);
	// Nothing to do if no reason to index was found, the gr_list is registered and indexing is not forced:
	if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		return;
	}
	// Setting message:
	if ($this->forceIndexing) {
		$indexingReason = 'Forced';
	} elseif ($check > 0) {
		$indexingReason = $this->reasons[$check];
	} else {
		$indexingReason = 'Updates gr_list!';
	}
	$this->log_setTSlogMessage('Indexing needed, reason: ' . $indexingReason, 1);
	// Divide into title,keywords,description and body:
	$this->log_push('Split content', '');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();
	// Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
	$this->content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
	// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
	// If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
	// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check !== 1) {
		// Same content is indexed already: update the timestamp ...
		$this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
		$this->updateSetId($this->hash['phash']);
		// ... and the gr_list. $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->update_grlist($checkCHash['phash'], $this->hash['phash']);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}
	// Full indexing run; the parsetime measured from here on is stored with the page:
	$Pstart = GeneralUtility::milliseconds();
	$this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
	$this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
	$this->log_pull();
	// Splitting words
	$this->log_push('Extract words from content', '');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();
	// Analyse the indexed words.
	$this->log_push('Analyse the extracted words', '');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();
	// Submitting page (phash) record
	$this->log_push('Submitting page', '');
	$this->submitPage();
	$this->log_pull();
	// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words', '');
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $this->hash['phash']);
	}
	$this->log_pull();
	// Set parsetime
	$this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);
	// Checking external files if configured for.
	$this->log_push('Checking external files', '');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
492
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as the head part. If the content has
 * no body tag at all, the complete content is treated as head, so title and meta tags
 * are still extracted (the body part is empty then).
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content) {
	$contentArr = $this->defaultContentArray;
	// Divide head from body. stristr() yields FALSE without a body tag; in that case
	// substr($content, 0, -0) would return an empty head, so handle it explicitly.
	$bodyPart = stristr($content, '<body');
	if ($bodyPart === FALSE) {
		$headPart = $content;
		$bodyPart = '';
	} else {
		$headPart = substr($content, 0, -strlen($bodyPart));
	}
	$contentArr['body'] = $bodyPart;
	// Get title; if it contains a colon only the part after the first colon is used
	$this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
	$titleParts = explode(':', $contentArr['title'], 2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
	// Get keywords and description metatags
	if ($this->conf['index_metatags']) {
		// Collect the attribute strings of all meta tags; each call cuts off the head part after the tag found
		$metaTags = array();
		$attributeString = NULL;
		while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $attributeString)) {
			$metaTags[] = $attributeString;
		}
		foreach ($metaTags as $metaTag) {
			$attributes = GeneralUtility::get_tag_attributes($metaTag);
			// Meta tags like http-equiv or charset have no "name" and/or "content" attribute
			$metaName = isset($attributes['name']) ? $attributes['name'] : '';
			$metaContent = isset($attributes['content']) ? $attributes['content'] : '';
			if (stristr($metaName, 'keywords')) {
				$contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
			}
			if (stristr($metaName, 'description')) {
				$contentArr['description'] .= ',' . $metaContent;
			}
		}
	}
	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);
	// Get rid of unwanted sections (ie. scripting and style stuff) in body
	foreach (explode(',', $this->excludeSections) as $tag) {
		while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
			// Each pass removes one section; nothing else to do
		}
	}
	// Remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));
	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);
	return $contentArr;
}
544
/**
 * Extract the charset value from the "Content-Type" http-equiv meta tag of a HTML document.
 *
 * @param string $content HTML content
 * @return string|NULL The charset value if found, otherwise NULL.
 */
public function getHTMLcharset($content) {
	$metaTagMatch = array();
	if (!preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $metaTagMatch)) {
		return NULL;
	}
	$charsetMatch = array();
	if (!preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $metaTagMatch[0], $charsetMatch)) {
		return NULL;
	}
	return $charsetMatch[1];
}
558
/**
 * Converts a HTML document to utf-8: first the character set is converted
 * (unless it is utf-8 already or unknown), then entities are converted.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '') {
	// Find charset; fall back to the meta tag of the document if none was passed:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$sourceCharset = $this->csObj->parse_charset($charset);
	// Convert charset:
	$needsConversion = $sourceCharset && $sourceCharset !== 'utf-8';
	$utf8Content = $needsConversion ? $this->csObj->utf8_encode($content, $sourceCharset) : $content;
	// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($utf8Content, TRUE);
}
578
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The tag is searched case-insensitively. If the opening tag has no matching end tag
 * (e.g. <meta>), $tagContent is empty and $stringAfter holds only what follows the
 * opening tag. An opening tag that is never closed with ">" is handled the same way,
 * with an empty $stringAfter.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
	$endTag = '</' . $tagName . '>';
	$startTag = '<' . $tagName;
	// stripos used because we want a case-insensitive search for the tag.
	$startPosition = stripos($string, $startTag);
	// If the tag was not found, return FALSE and leave the reference parameters untouched
	if ($startPosition === FALSE) {
		return FALSE;
	}
	// Split "attributes>rest"; the second part is missing if the opening tag is not terminated
	$tagParts = explode('>', substr($string, $startPosition + strlen($startTag)), 2);
	$paramList = $tagParts[0];
	$afterOpeningTag = isset($tagParts[1]) ? $tagParts[1] : '';
	$endTagPosition = stripos($afterOpeningTag, $endTag);
	if ($endTagPosition !== FALSE) {
		$tagContent = substr($afterOpeningTag, 0, $endTagPosition);
		// Original string with the whole element cut out
		$stringAfter = substr($string, 0, $startPosition) . substr($afterOpeningTag, $endTagPosition + strlen($endTag));
	} else {
		$tagContent = '';
		$stringAfter = $afterOpeningTag;
	}
	return TRUE;
}
612
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin"
 * marker is kept; at an "end" marker the pending content that was not preceded by a
 * "begin" marker (e.g. the text before the very first marker) is kept. Everything
 * else is dropped. If no marker is present, the body is left untouched.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body) {
	$expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
	if (count($expBody) <= 1) {
		return FALSE;
	}
	$body = '';
	// Pending content; initialized so that an "end" marker never reads an undefined variable
	$prev = '';
	foreach ($expBody as $val) {
		$part = explode('-->', $val, 2);
		$marker = trim($part[0]);
		if ($marker === 'begin') {
			// A marker lacking its closing "-->" has no content part
			$body .= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker === 'end') {
			$body .= $prev;
		} else {
			$prev = $val;
		}
	}
	return TRUE;
}
639
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme are passed to indexExternalUrl() if "indexExternalURLs" is
 * enabled. Links without query string that point to an existing local file are
 * indexed directly, or added to the crawler queue if "useCrawlerForExternalFiles"
 * is enabled and the crawler extension is loaded.
 *
 * @param string $content HTML content
 * @return void
 */
public function extractLinks($content) {
	// Get links:
	$list = $this->extractHyperLinks($content);
	// Stays NULL unless files are to be handed over to the crawler
	$crawler = NULL;
	if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
	}
	// Traverse links:
	foreach ($list as $linkInfo) {
		// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
		$hasLocalPath = !empty($linkInfo['localPath']);
		// Decode entities:
		$linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
		// Parse URL; parse_url() returns FALSE for seriously malformed URLs
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}
		// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
			$getP = array();
			parse_str($qParts['query'], $getP);
			$linkSource = isset($getP['jumpurl']) && is_string($getP['jumpurl']) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);
			if (!is_array($qParts)) {
				$qParts = array();
			}
		}
		if (!$hasLocalPath && !empty($qParts['scheme'])) {
			if (!empty($this->indexerConfig['indexExternalURLs'])) {
				// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}
		// Links with a query string are not considered to be files
		if (!empty($qParts['query'])) {
			continue;
		}
		$linkSource = urldecode($linkSource);
		if (GeneralUtility::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
		}
		// Best effort: is_file() warnings are suppressed on purpose
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}
		if (is_object($crawler)) {
			// Hand the file over to the crawler queue; for download-script links the public URL is passed as "alturl"
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			// The page content is not needed for indexing the file
			unset($params['conf']['content']);
			$crawler->addQueueEntry_callBack(0, $params, '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerFilesHook', $this->conf['id']);
			$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
		} elseif ($hasLocalPath) {
			// Index local file, registered under the URL of the download script:
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			// Index local file:
			$this->indexRegularDocument($linkSource);
		}
	}
}
716
/**
 * Extracts all links to external documents from the HTML content string.
 * Links without href and pure anchor links (href starting with "#") are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html) {
	$parser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
	$links = array();
	foreach ($parser->splitTags('a', $html) as $position => $tag) {
		// splitTags() delivers the tags at the odd positions
		if ($position % 2 === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($tag, TRUE);
		$tagName = $parser->getFirstTagName($tag);
		if (strtolower($tagName) != 'a') {
			continue;
		}
		if (!$attributes[0]['href'] || $attributes[0]['href'][0] == '#') {
			continue;
		}
		$links[] = array(
			'tag' => $tag,
			'href' => $attributes[0]['href'],
			'localPath' => $this->createLocalPath($attributes[0]['href'])
		);
	}
	return $links;
}
745
/**
 * Extracts the "base href" from content string: the href of the first
 * base tag that has a non-empty href attribute.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html) {
	$parser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
	$baseHref = '';
	foreach ($parser->splitTags('base', $html) as $position => $tag) {
		// splitTags() delivers the tags at the odd positions
		if ($position % 2 === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($tag, TRUE);
		$tagName = $parser->getFirstTagName($tag);
		if (strtolower($tagName) != 'base') {
			continue;
		}
		$baseHref = $attributes[0]['href'];
		if ($baseHref) {
			// First usable href wins
			return $baseHref;
		}
	}
	return $baseHref;
}
770
771 /******************************************
772 *
773 * Indexing; external URL
774 *
775 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The response headers are requested first. Only if a Content-Type header
 * containing "text/html" is found, the document itself is fetched, stored in
 * a temporary file and passed to indexRegularDocument() with forced indexing.
 * The fetched content is also kept in $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl) {
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the remote side, nothing to index
        return;
    }
    // HTTP header field names are case-insensitive, so "content-type" must match as well.
    // If the header shows up more than once, the last occurrence is used.
    $contentType = '';
    foreach ($urlHeaders as $headerName => $headerValue) {
        if (strtolower(trim($headerName)) === 'content-type') {
            $contentType = (string)$headerValue;
        }
    }
    if (stristr($contentType, 'text/html') === FALSE) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense for them).
        $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing failed with an exception
        unlink($tmpFile);
    }
}
805
/**
 * Getting HTTP request headers of URL
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP header names are keys. The value is the remainder of the header line after the first colon, kept as-is (so it may start with a space). Lines without a colon, such as the HTTP status line, are stored with a NULL value.
 */
public function getUrlHeaders($url) {
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        // Return FALSE explicitly, as documented
        return FALSE;
    }
    // Compile headers:
    $retVal = array();
    foreach (GeneralUtility::trimExplode(LF, $content, TRUE) as $line) {
        if (trim($line) === '') {
            break;
        }
        // Not every line has a colon (e.g. "HTTP/1.1 200 OK"), so do not rely on a second part
        $parts = explode(':', $line, 2);
        $retVal[$parts[0]] = isset($parts[1]) ? $parts[1] : NULL;
    }
    return $retVal;
}
830
/**
 * Tries to map a link target to a local file.
 *
 * A fixed list of resolver methods is tried in order; the first non-empty
 * result is returned.
 *
 * @param string $sourcePath Link target (path or URL)
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
    static $resolvers = array(
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    );
    $resolvedPath = '';
    foreach ($resolvers as $resolver) {
        $resolvedPath = $this->{$resolver}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
854
/**
 * Looks the link target up in the file list registered in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] (keyed by
 * the short MD5 of the target). This is useful for various download
 * extensions that hide the actual file name but still want the file to be
 * indexed.
 *
 * @param string $sourcePath
 * @return string Registered file path if it exists as a file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: self::isAllowedLocalFile is deliberately not used here because
    // this method is allowed to index files outside of the web site (for
    // example, protected downloads)
    if (isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey])) {
        return $registeredFiles[$lookupKey];
    }
    return '';
}
877
/**
 * Tries to build a local file path for targets that start with the site URL
 * of the current request (TYPO3_SITE_URL).
 *
 * @param string $sourcePath
 * @return string Path below PATH_site if it is an allowed local file, else empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    // Strip the site URL and look for the remainder below the web root
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
897
/**
 * Tries to build a local file path for targets that start with the
 * configured absRefPrefix. This requires TSFE. If TSFE is missing, or
 * absRefPrefix is empty, this function does nothing.
 *
 * @param string $sourcePath
 * @return string Path below PATH_site if it is an allowed local file, else empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
    if (!($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    // Strip the prefix and look for the remainder below the web root
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
920
/**
 * Attempts to create a local file path from the absolute URL without
 * schema, i.e. a target starting with "/".
 *
 * @param string $sourcePath
 * @return string Path below PATH_site if it is an allowed local file, else empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
    $localPath = '';
    // isset() guards against reading offset 0 of an empty string
    if (isset($sourcePath[0]) && $sourcePath[0] == '/') {
        // Drop the leading slash; PATH_site is prepended instead
        $sourcePath = substr($sourcePath, 1);
        $localPath = PATH_site . $sourcePath;
        if (!self::isAllowedLocalFile($localPath)) {
            $localPath = '';
        }
    }
    return $localPath;
}
939
/**
 * Tries to build a local file path for a relative URL by prepending PATH_site.
 *
 * @param string $sourcePath
 * @return string Path below PATH_site if it is an allowed local file, else empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
956
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not
 * start with "/".
 *
 * @param string $url
 * @return bool TRUE if the URL is considered relative. URLs that parse_url() cannot parse are reported as relative as well (unchanged legacy behaviour).
 */
static protected function isRelativeURL($url) {
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() gave up on the URL; keep the result the loose comparisons produced before
        return TRUE;
    }
    // parse_url() only returns the components that are present, so do not read missing keys
    $scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
    $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
    return $scheme == '' && ($path === '' || $path[0] != '/');
}
967
/**
 * Tells whether a path, after resolving back paths, starts with PATH_site
 * and points to an existing file.
 *
 * @param string $filePath
 * @return bool
 */
static protected function isAllowedLocalFile($filePath) {
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    $startsWithSitePath = strncmp($resolvedPath, PATH_site, strlen(PATH_site)) === 0;
    return $startsWithSitePath && is_file($resolvedPath);
}
980
981 /******************************************
982 *
983 * Indexing; external files (PDF, DOC, etc)
984 *
985 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is split into the parts reported by the external parser for
 * its extension; every part is checked (mtime/tstamp, content hash) and, if
 * needed or forced, indexed and stored. A section record is ensured for
 * every part, whether it was re-indexed or not.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc). This also bypasses the external file counter limit.
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 */
public function indexRegularDocument($file, $force = FALSE, $contentTmpFile = '', $altExtension = '') {
    // Init
    $fI = pathinfo($file);
    // pathinfo() has no "extension" key for names without a dot
    $ext = $altExtension ?: (isset($fI['extension']) ? strtolower($fI['extension']) : '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = array();
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $subinfo = array('key' => $cPKey);
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // implode() takes the glue first; the reversed legacy order is no longer accepted by PHP 8
                    $content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The stat() values are handed over as they are: mtime as modification time, ctime as creation time
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1091
/**
 * Delegates reading of an external file to the parser object registered for
 * its extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array Standard content array (title, description, keywords, body keys), or NULL if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return NULL;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1109
/**
 * Asks the parser object registered for the extension into which sections
 * the document should be divided.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; array(0) if no parser object is registered for the extension
 */
public function fileContentParts($ext, $absFile) {
    $parser = $this->external_parsers[$ext];
    if (!is_object($parser)) {
        // No division without a parser
        return array(0);
    }
    return $parser->fileContentParts($ext, $absFile);
}
1125
/**
 * Wraps non-HTML content (from external files for instance) into a content
 * array: a copy of $this->defaultContentArray with "body" set to the input.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set; all other keys are taken over from $this->defaultContentArray
 * @see splitHTMLContent()
 */
public function splitRegularContent($content) {
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;
    return $parts;
}
1138
1139 /**********************************
1140 *
1141 * Analysing content, Extracting words
1142 *
1143 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * charset already is "utf-8") and decodes numeric / HTML entities in it.
 *
 * @param array $contentArr Standard content array, modified in place
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 */
public function charsetEntity2utf8(&$contentArr, $charset) {
    foreach (array_keys($contentArr) as $field) {
        if ((string)$contentArr[$field] === '') {
            continue;
        }
        $text = $contentArr[$field];
        // Convert charset if necessary
        if ($charset !== 'utf-8') {
            $text = $this->csObj->utf8_encode($text, $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$field] = $this->csObj->entities_to_utf8($text, TRUE);
    }
}
1163
/**
 * Splits every part of the content array (from the split*Content functions)
 * into words using the lexer object.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words; for "title", "keywords" and "description" duplicates are removed
 */
public function processWordsInArrays($contentArr) {
    // split all parts to words
    foreach ($contentArr as $field => $text) {
        $contentArr[$field] = $this->lexerObj->split2Words($text);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (array('title', 'keywords', 'description') as $headerField) {
        $contentArr[$headerField] = array_unique($contentArr[$headerField]);
    }
    return $contentArr;
}
1182
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length is taken from $this->conf['index_descrLgd'], forced
 * into the range 0-255 with 200 as default. A length of 0 yields an empty
 * description.
 *
 * @param array $contentArr Content array
 * @return string Description string
 */
public function bodyDescription($contentArr) {
    // Always return a string, also when no description is wanted
    $bodyDescription = '';
    // Setting description
    $maxL = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        // NOTE(review): the first search needle reads as a plain space here, which would make
        // that replacement a no-op - verify whether a non-breaking space / "&nbsp;" was intended.
        $bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }
    return $bodyDescription;
}
1199
/**
 * Analyzes content to use for indexing.
 *
 * Title, keywords and description are analyzed with analyzeHeaderinfo()
 * (bit offsets 7, 6 and 5), the body with analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array, keyed by word, as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content) {
    $indexArr = array();
    $this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
    $this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
    $this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1215
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] an entry keyed by the word (cut to 60
 * bytes) is created or updated in the index array: "hash" and "metaphone"
 * are set on first occurrence, the bit 2^$offset is OR-ed into "cmp" and
 * "count" is increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 * @return void
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet on first occurrence, so start from 0.
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
        // Increase number of occurrences (starting from 0 for a new word)
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1247
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] an entry keyed by the word (cut to 60
 * bytes) is created or updated in the index array: "first" (position of the
 * first occurrence), "hash" and "metaphone" are set when the word is new,
 * "count" is increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 */
public function analyzeBody(&$retArr, $content) {
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences (starting from 0 for a new word)
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1277
/**
 * Computes the metaphone value of a word, either through the configured
 * metaphone object or through PHP's native metaphone() function.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Raw metaphone value if requested; otherwise the integer hash of it, or 0 if the raw value is an empty string
 */
public function metaphone($word, $returnRawMetaphoneValue = FALSE) {
    // Fall back to the native PHP function when no metaphone object is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue !== '') {
        // Create hash and return integer
        return \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($rawValue);
    }
    return 0;
}
1302
1303 /********************************
1304 *
1305 * SQL; TYPO3 Pages
1306 *
1307 *******************************/
/**
 * Stores the currently indexed TYPO3 page (not external media) in the index
 * tables: old records for the phash are removed first, then index_phash,
 * index_section, index_grlist, index_fulltext and (in debug mode)
 * index_debug are written. Tables reported as unused are skipped.
 *
 * @return void
 */
public function submitPage() {
    $phash = $this->hash['phash'];
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);
    // setting new phash_row
    $pageRecord = array(
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // Item type 0: TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $pageRecord);
    }
    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);
    // PROCESSING index_fulltext
    $fulltextData = implode(' ', $this->contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextData = substr($fulltextData, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRecord = array(
        'phash' => $phash,
        'fulltextdata' => $fulltextData,
        'metaphonedata' => $this->metaphoneContent
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRecord);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    $debugInfo = array(
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        // Content and body are cut to 1000 bytes for the debug record
        'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
        'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRecord = array(
        'phash' => $phash,
        'debuginfo' => serialize($debugInfo)
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRecord);
    }
}
1381
/**
 * Inserts an index_grlist record for the current gr_list
 * ($this->conf['gr_list']), if that table is in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $groupList = $this->conf['gr_list'];
    // Setting the gr_list record
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ));
}
1402
/**
 * Inserts an index_section record for the current page id, if that table is
 * in use. The record is passed through getRootLineFields() before insertion.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 * @return void
 */
public function submit_section($hash, $hash_t3) {
    $sectionRecord = array(
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    );
    $this->getRootLineFields($sectionRecord);
    $sectionTableInUse = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section');
    if ($sectionTableInUse) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRecord);
    }
}
1422
/**
 * Deletes all index records of a page, identified by $phash, from the index
 * tables that are in use.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedPages($phash) {
    $phashValue = (int)$phash;
    // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
    $tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($tables as $table) {
        if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($table)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . $phashValue);
    }
    // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $phashValue);
    }
}
1442
1443 /********************************
1444 *
1445 * SQL; External media
1446 *
1447 *******************************/
/**
 * Updates db with information about the file
 *
 * Old records for the phash are removed first, then index_phash,
 * index_fulltext and (in debug mode) index_debug are written. Tables
 * reported as unused are skipped.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {
    // Find item Type: the parser's mapping for the extension, falling back to the extension itself
    $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    $storeItemType = $storeItemType ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);
    // A scheme is only present for URLs; parse_url() omits the key for plain file names
    // and may return FALSE for malformed input
    $isExternalUrl = !empty($fileParts['scheme']);
    // Setting new
    $fields = array(
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        // Use the file name as title if the document has none
        'item_title' => trim($contentParts['title']) ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $isExternalUrl ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
    }
    // PROCESSING index_fulltext
    $fields = array(
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    );
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = array(
            'phash' => $hash['phash'],
            'debuginfo' => serialize(array(
                'cHashParams' => $subinfo,
                // Body is cut to 1000 bytes for the debug record
                'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ))
        );
        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
            $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
        }
    }
}
1523
/**
 * Stores a gr_list record for a file unless index_grlist already holds one
 * for this phash with either the default gr_list or the current gr_list.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_grlist($hash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $defaultListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList);
    $currentListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $where = 'phash=' . (int)$hash . ' AND (hash_gr_list=' . $defaultListHash . ' OR hash_gr_list=' . $currentListHash . ')';
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing == 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1539
/**
 * Stores a section record for a file unless index_section already holds one
 * for this phash on the current page id.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_section($hash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // Testing if there is already a section
    $where = 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id'];
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
    if ($existing == 0) {
        // The phash of the current page is used as phash_t3
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1555
/**
 * Deletes the index records of a file, identified by $phash, from
 * index_phash, index_grlist, index_fulltext and index_debug (as far as these
 * tables are in use). Section records are not touched here.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedFiles($phash) {
    $phashValue = (int)$phash;
    // Removing old registrations for tables.
    $tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($tables as $table) {
        if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($table)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . $phashValue);
    }
}
1571
1572 /********************************
1573 *
1574 * SQL Helper functions
1575 *
1576 *******************************/
/**
 * Decides, based on mtime / tstamp of the already indexed record for $phash,
 * whether the page/file has to be (re-)indexed.
 *
 * Result codes (see $this->reasons):
 *  -2  Minimum age was NOT exceeded, indexing must not occur.
 *  -1  mtime matched the indexed record, no need to re-index. As a side effect
 *      the record's tstamp is refreshed unless a max age is configured.
 *   1  Max age exceeded, index again.
 *   2  mtime differs from the indexed record, index again.
 *   3  No mtime given, so index.
 *   4  No indexed record found (or index_phash table not in use), so index.
 *
 * @param int $mtime mtime value to test against limits and the indexed record (usually the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed record
 * @return int <0 = no indexing, >0 = do indexing
 */
public function checkMtimeTstamp($mtime, $phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }
    $indexed = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
    if (!$indexed) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }
    if ($this->tstamp_maxAge && $indexed['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    $minAgePassed = !$this->tstamp_minAge || $indexed['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'];
    if (!$minAgePassed) {
        // The minimum age was not exceeded
        return -2;
    }
    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }
    if ($indexed['item_mtime'] != $mtime) {
        // The minimum age was exceeded and the given mtime differs from the stored one.
        return 2;
    }
    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $secondsLeft = $indexed['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME'];
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . $secondsLeft . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1632
/**
 * Looks for an index_phash record in the same "phash_grouping" that carries
 * the same content hash as the current content.
 *
 * @return mixed TRUE if the page needs to be indexed (no such record, or index_phash not in use), otherwise the matching row (array with key "phash") to which the grlist record should be related
 */
public function checkContentHash() {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return TRUE;
    }
    // The page is only indexed if its content differs from the same "phash_grouping" page.
    $where = 'phash_grouping=' . (int)$this->hash['phash_grouping'] . ' AND contentHash=' . (int)$this->content_md5h;
    $match = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
    return $match ? $match : TRUE;
}
1649
/**
 * Checks the content hash of an external document against index_phash.
 *
 * @param int $hashGr phash_grouping value to check
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed (no record with this grouping and content hash, or index_phash not in use)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return TRUE;
    }
    $where = 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h;
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
    return $matches == 0;
}
1666
/**
 * Tells whether index_grlist holds at least one record whose phash_x equals
 * the given value (the "real" phash of the current content, not the
 * linked-to phash of the common search result page).
 *
 * @param int $phash_x Phash integer to test
 * @return bool FALSE when no record exists or index_grlist is not in use
 */
public function is_grlist_set($phash_x) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return FALSE;
    }
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
    return $matches > 0;
}
1681
/**
 * Writes a grlist entry for the given phash and the current gr_list,
 * unless such an entry already exists.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values differ when a page with user login turns out to contain the exact same content as another already indexed version of the page
 * @return void
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $groupListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $where = 'phash=' . (int)$phash . ' AND hash_gr_list=' . $groupListHash;
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing == 0) {
        $this->submit_grlist($phash, $phash_x);
        $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
    }
}
1699
/**
 * Sets the tstamp of an index_phash row to the current execution time and,
 * optionally, its item_mtime.
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-empty), the item_mtime field is updated to this value
 * @return void
 */
public function updateTstamp($phash, $mtime = 0) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = array();
    $changes['tstamp'] = $GLOBALS['EXEC_TIME'];
    if ($mtime) {
        $changes['item_mtime'] = (int)$mtime;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1718
/**
 * Writes the configured freeIndexSetId into the index_phash record.
 *
 * @param int $phash phash value
 * @return void
 */
public function updateSetId($phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = array('freeIndexSetId' => (int)$this->conf['freeIndexSetId']);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1733
/**
 * Stores the parse time in the index_phash row.
 *
 * @param int $phash phash value
 * @param int $parsetime Parsetime value to set
 * @return void
 */
public function updateParsetime($phash, $parsetime) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = array('parsetime' => (int)$parsetime);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1749
/**
 * Refreshes the rootline fields of all index_section rows that belong to
 * the current page id.
 *
 * @return void
 */
public function updateRootline() {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // getRootLineFields() fills the array by reference.
    $rootlineValues = array();
    $this->getRootLineFields($rootlineValues);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int)$this->conf['id'], $rootlineValues);
}
1762
/**
 * Adds the root-line field values to the given field array.
 * rl0, rl1 and rl2 are always set; additional fields can be configured in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as "field name => rootline level" pairs.
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 */
public function getRootLineFields(array &$fieldArray) {
    // Standard fields, mapped to the first three rootline levels.
    $standardLevels = array('rl0' => 0, 'rl1' => 1, 'rl2' => 2);
    foreach ($standardLevels as $column => $level) {
        $fieldArray[$column] = (int)$this->conf['rootline_uids'][$level];
    }
    // Extra fields are applied afterwards, so a configured name may override a standard one.
    $extraLevels = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
    if (is_array($extraLevels)) {
        foreach ($extraLevels as $fieldName => $rootLineLevel) {
            $fieldArray[$fieldName] = (int)$this->conf['rootline_uids'][$rootLineLevel];
        }
    }
}
1780
/**
 * Removes any indexed pages with userlogins which has the same contentHash
 * NOT USED anywhere inside this class!
 *
 * Selects every index_phash record of the current "phash_grouping" that has
 * the current content hash and is linked to a grlist record whose group list
 * hash differs from the default group list, and removes each of them via
 * removeOldIndexedPages(). Does nothing unless both index_phash and
 * index_grlist are in use.
 *
 * @return void
 */
public function removeLoginpagesWithContentHash() {
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash') && \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        // Join phash (A) and grlist (B) records on phash.
        $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', '
 A.phash=B.phash
 AND A.phash_grouping=' . (int)$this->hash['phash_grouping'] . '
 AND B.hash_gr_list<>' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList) . '
 AND A.contentHash=' . (int)$this->content_md5h);
        // The loop is skipped entirely when the query did not yield a result resource.
        while ($res && FALSE !== ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
            $this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
            $this->removeOldIndexedPages($row['phash']);
        }
        $GLOBALS['TYPO3_DB']->sql_free_result($res);
    }
}
1801
/**
 * Loads the library class file of the "crawler" extension.
 *
 * @return void
 */
public function includeCrawlerClass() {
    $crawlerExtensionPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler');
    GeneralUtility::requireOnce($crawlerExtensionPath . 'class.tx_crawler_lib.php');
}
1810
1811 /********************************
1812 *
1813 * SQL; Submitting words
1814 *
1815 *******************************/
/**
 * Makes sure every word of the word list is present in index_words,
 * inserting the ones that are missing.
 *
 * @param array $wordListArray Word list array, keyed by the word itself (each entry holds at least "hash" and "metaphone")
 * @return void
 */
public function checkWordList($wordListArray) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
        return;
    }
    if (!count($wordListArray)) {
        return;
    }
    // Collect the word ids (hashes) and build the shared WHERE clause.
    $wordIds = array();
    foreach ($wordListArray as $wordInfo) {
        $wordIds[] = (int)$wordInfo['hash'];
    }
    $where = 'wid IN (' . implode(',', $wordIds) . ')';
    $knownCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
    if ($knownCount == count($wordListArray)) {
        // All words are registered already.
        return;
    }
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
    $this->log_setTSlogMessage('Inserting words: ' . (count($wordListArray) - $knownCount), 1);
    // Drop every word that is already stored; what remains has to be inserted.
    while (FALSE != ($knownWord = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
        unset($wordListArray[$knownWord['baseword']]);
    }
    $GLOBALS['TYPO3_DB']->sql_free_result($res);
    foreach ($wordListArray as $word => $wordInfo) {
        // A duplicate-key error will occur here if a word was NOT removed by the unset() above.
        // However as long as the words are NOT longer than 60 chars (the baseword varchar is
        // 60 characters...) this is not a problem.
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
            'wid' => $wordInfo['hash'],
            'baseword' => $word,
            'metaphone' => $wordInfo['metaphone']
        ));
    }
}
1851
/**
 * Submits RELATIONS between words and phash.
 *
 * All existing index_rel rows of the phash are deleted first; then one row
 * per word is inserted, skipping every word whose wid is flagged as stop
 * word in index_words.
 *
 * @param array $wordList Word list array (each entry holds "hash", "count", "first" and "cmp")
 * @param int $phash phash value
 * @return void
 */
public function submitWords($wordList, $phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    // Stop words, indexed by their wid so that isset() can be used for the lookup.
    $stopWords = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('wid', 'index_words', 'is_stopword != 0', '', '', '', 'wid');

    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . (int)$phash);
    $fields = array('phash', 'wid', 'count', 'first', 'freq', 'flags');
    $rows = array();
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        // Guard against a zero word count: dividing by zero raises a warning
        // (or an error on newer PHP versions); treat the frequency as 0 then.
        $frequency = $this->wordcount ? $val['count'] / $this->wordcount : 0;
        $rows[] = array(
            (int)$phash,
            (int)$val['hash'],
            (int)$val['count'],
            (int)$val['first'],
            $this->freqMap($frequency),
            $val['cmp'] & $this->flagBitMask
        );
    }
    // Do not issue an INSERT without rows (empty word list or only stop words).
    if (count($rows)) {
        $GLOBALS['TYPO3_DB']->exec_INSERTmultipleRows('index_rel', $fields, $rows);
    }
}
1882
/**
 * Maps a frequency from a real number in [0;1] to a value in
 * [0;$this->freqRange] (anything above the range is capped at
 * $this->freqRange), and back: an input greater than 1 is divided by the
 * same factor instead of multiplied.
 *
 * The mapped value is not rounded, so it may be a float.
 *
 * @param double $freq Frequency
 * @return int|float Frequency in range
 */
public function freqMap($freq) {
    $scale = $this->freqMax * 100 * $this->freqRange;
    if ($freq <= 1) {
        $scaled = $freq * $scale;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }
    // Reverse direction.
    return $freq / $scale;
}
1900
1901 /********************************
1902 *
1903 * Hashing
1904 *
1905 *******************************/
/**
 * Computes the search hashes for a TYPO3 page and stores them in
 * $this->hash ("phash_grouping" and "phash").
 *
 * @return void
 */
public function setT3Hashes() {
    // Parameters identifying a "page": id, type, language, mountpoint and cHash parameters.
    // The key order matters because the serialized array is what gets hashed.
    $pageIdentity = array();
    $pageIdentity['id'] = (int)$this->conf['id'];
    $pageIdentity['type'] = (int)$this->conf['type'];
    $pageIdentity['sys_lang'] = (int)$this->conf['sys_language_uid'];
    $pageIdentity['MP'] = (string)$this->conf['MP'];
    $pageIdentity['cHash'] = $this->cHashParams;
    // Grouping hash: the same for all login variants of the page.
    $this->hash['phash_grouping'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($pageIdentity));
    // Plain phash: additionally takes the gr_list into account (subdivision where special
    // page composition based on login is considered as well).
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($pageIdentity));
}
1926
/**
 * Computes the search hashes for an external file.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = array()) {
    // The grouping hash is based on the file alone ...
    $identity = array('file' => $file);
    $groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
    // ... while the phash also covers the sub information.
    $identity['subinfo'] = $subinfo;
    $plainHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
    return array(
        'phash_grouping' => $groupingHash,
        'phash' => $plainHash
    );
}
1947
1948 /*********************************
1949 *
1950 * Internal logging functions
1951 *
1952 *********************************/
/**
 * Forwards a push() call to the object in $GLOBALS['TT'], if there is one.
 *
 * @param string $msg Title to set
 * @param string $key Key, passed through unchanged as second argument of push()
 * @return void
 */
public function log_push($msg, $key) {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->push($msg, $key);
}
1965
/**
 * Forwards a pull() call to the object in $GLOBALS['TT'], if there is one.
 *
 * @return void
 */
public function log_pull() {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->pull();
}
1976
/**
 * Passes a log message to the object in $GLOBALS['TT'] (if there is one)
 * and always appends it to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 * @return void
 */
public function log_setTSlogMessage($msg, $errorNum = 0) {
    $timeTracker = $GLOBALS['TT'];
    if (is_object($timeTracker)) {
        $timeTracker->setTSlogMessage($msg, $errorNum);
    }
    // Recorded regardless of whether the message was passed on above.
    $this->internal_log[] = $msg;
}
1990
1991 /**************************
1992 *
1993 * \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController hooks:
1994 *
1995 **************************/
/**
 * Makes sure that keywords are space-separated. This is important for their
 * proper displaying as a part of fulltext index.
 *
 * The comma-separated list is split, each keyword is trimmed, and the
 * keywords are re-joined with ", "; the result is wrapped in single spaces.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList) {
    $trimmedKeywords = GeneralUtility::trimExplode(',', $keywordList);
    $spacedList = implode(', ', $trimmedKeywords);
    return ' ' . $spacedList . ' ';
}
2008
2009 }