[TASK] Rebuild the calcAge functionality
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /**
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18 use TYPO3\CMS\Core\Utility\DateTimeUtility;
19
20 /**
21 * This class is a search indexer for TYPO3
22 *
23 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
24 */
25 /**
26 * Indexing class for TYPO3 frontend
27 *
28 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
29 */
30 class Indexer {
31
32 // Messages:
33 /**
34 * @todo Define visibility
35 */
36 public $reasons = array(
37 -1 => 'mtime matched the document, so no changes detected and no content updated',
38 -2 => 'The minimum age was not exceeded',
39 1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
40 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
41 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
42 4 => 'Page has never been indexed (is not represented in the index_phash table).'
43 );
44
45 // HTML code blocks to exclude from indexing:
46 /**
47 * @todo Define visibility
48 */
49 public $excludeSections = 'script,style';
50
51 // Supported Extensions for external files:
52 /**
53 * @todo Define visibility
54 */
55 public $external_parsers = array();
56
57 // External parser objects, keys are file extension names. Values are objects with certain methods.
58 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
59 /**
60 * @todo Define visibility
61 */
62 public $defaultGrList = '0,-1';
63
64 // Min/Max times:
65 /**
66 * @todo Define visibility
67 */
68 public $tstamp_maxAge = 0;
69
70 // Describes $tstamp_maxAge (declared above): if set, this is the maximum age in seconds of an indexed document. Regardless of mtime, the document will be re-indexed once this limit is exceeded.
71 /**
72 * @todo Define visibility
73 */
74 public $tstamp_minAge = 0;
75
76 // Describes $tstamp_minAge (declared above): if set, this is the minimum number of seconds that must pass before a document can be indexed again. This is regardless of mtime.
77 /**
78 * @todo Define visibility
79 */
80 public $maxExternalFiles = 0;
81
82 // Max number of external files to index.
83 /**
84 * @todo Define visibility
85 */
86 public $forceIndexing = FALSE;
87
88 // If TRUE, indexing is forced despite of hashes etc.
89 /**
90 * @todo Define visibility
91 */
92 public $crawlerActive = FALSE;
93
94 // Set when crawler is detected (internal)
95 // INTERNALS:
96 /**
97 * @todo Define visibility
98 */
99 public $defaultContentArray = array(
100 'title' => '',
101 'description' => '',
102 'keywords' => '',
103 'body' => ''
104 );
105
106 /**
107 * @todo Define visibility
108 */
109 public $wordcount = 0;
110
111 /**
112 * @todo Define visibility
113 */
114 public $externalFileCounter = 0;
115
116 /**
117 * @todo Define visibility
118 */
119 public $conf = array();
120
121 // Configuration set internally (see init functions for required keys and their meaning)
122 /**
123 * @todo Define visibility
124 */
125 public $indexerConfig = array();
126
127 // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
128 /**
129 * @todo Define visibility
130 */
131 public $hash = array();
132
133 // Hash array, contains phash and phash_grouping
134 /**
135 * @todo Define visibility
136 */
137 public $file_phash_arr = array();
138
139 // Hash array for files
140 /**
141 * @todo Define visibility
142 */
143 public $contentParts = array();
144
145 // Content of TYPO3 page
146 /**
147 * @todo Define visibility
148 */
149 public $content_md5h = '';
150
151 /**
152 * @todo Define visibility
153 */
154 public $internal_log = array();
155
156 // Internal log
157 /**
158 * @todo Define visibility
159 */
160 public $indexExternalUrl_content = '';
161
162 /**
163 * @todo Define visibility
164 */
165 public $cHashParams = array();
166
167 // cHashparams array
168 /**
169 * @todo Define visibility
170 */
171 public $freqRange = 32000;
172
173 /**
174 * @todo Define visibility
175 */
176 public $freqMax = 0.1;
177
178 /**
179 * @todo Define visibility
180 */
181 public $enableMetaphoneSearch = FALSE;
182
183 /**
184 * @todo Define visibility
185 */
186 public $storeMetaphoneInfoAsWords;
187
188 /**
189 * @todo Define visibility
190 */
191 public $metaphoneContent = '';
192
193 // Objects:
194 /**
195 * Charset class object
196 *
197 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
198 * @todo Define visibility
199 */
200 public $csObj;
201
202 /**
203 * Metaphone object, if any
204 *
205 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
206 * @todo Define visibility
207 */
208 public $metaphoneObj;
209
210 /**
211 * Lexer object for word splitting
212 *
213 * @var \TYPO3\CMS\IndexedSearch\Lexer
214 * @todo Define visibility
215 */
216 public $lexerObj;
217
218 /**
219 * @todo Define visibility
220 */
221 public $flagBitMask;
222
223 /**
224 * Parent Object (TSFE) Initialization
225 *
226 * @param object Parent Object (frontend TSFE object), passed by reference
227 * @return void
228 * @todo Define visibility
229 */
public function hook_indexContent(&$pObj) {
	// Settings of indexed_search as stored (serialized) by the Extension Manager.
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	// Forced re-indexing needs all of: crawler extension loaded, a crawler session running,
	// and the re-index processing instruction present in that session.
	$reindexRequestedByCrawler = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($reindexRequestedByCrawler) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}
	// Nothing to do (and nothing to log) unless indexing is enabled for this page.
	if (!$pObj->config['config']['index_enable']) {
		return;
	}
	$this->log_push('Index page', '');
	// Each branch below is one reason NOT to index; the final else performs the indexing.
	if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Reduce the rootline to a list of page uids, keeping the rootline keys.
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
			$rootlineUids[$rlkey] = $rldat['uid'];
		}
		$this->conf = array(
			// Identification of the page being indexed
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,
			'crdate' => $pObj->page['crdate'],
			'page_cache_reg1' => $pObj->page_cache_reg1,
			'rootline_uids' => $rootlineUids,
			// Page content (HTML), alternative title, charset and last-change time
			'content' => $pObj->content,
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			'metaCharset' => $pObj->metaCharset,
			'mtime' => $pObj->register['SYS_LASTCHANGED'],
			// Behaviour switches taken from the TypoScript "config" object
			'index_externals' => $pObj->config['config']['index_externals'],
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,
			// Not used for frontend page indexing
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0
		);
		$this->init();
		$this->indexTypo3PageContent();
	}
	$this->log_pull();
}
313
314 /****************************
315 *
316 * Backend API
317 *
318 ****************************/
319 /**
320 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
321 *
322 * @param integer The page uid, &id=
323 * @param integer The page type, &type=
324 * @param integer sys_language uid, typically &L=
325 * @param string The MP variable (Mount Points), &MP=
326 * @param array Rootline array of only UIDs.
327 * @param array Array of GET variables to register with this indexing
328 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
329 * @return void
330 * @todo Define visibility
331 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = FALSE) {
	// The cHash is only calculated on request; otherwise it stays empty.
	$cHash = '';
	if ($createCHash) {
		/* @var $cacheHashCalculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
		$cacheHashCalculator = GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\CacheHashCalculator');
		$cHash = $cacheHashCalculator->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
	}
	$this->conf = array(
		// Identification of the page the indexed content belongs to
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		// Group list is hardcoded for backend indexing
		'gr_list' => '0,-1',
		'cHash' => $cHash,
		'cHash_array' => $cHash_array,
		// Defaults
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',
		'rootline_uids' => $uidRL,
		// Behaviour: index external documents, description length 200, index meta tags
		'index_externals' => 1,
		'index_descrLgd' => 200,
		'index_metatags' => TRUE
	);
	$this->init();
}
373
374 /**
375 * Sets the free-index uid. Can be called right after backend_initIndexer()
376 *
377 * @param integer Free index UID
378 * @param integer Set id - an integer identifying the "set" of indexing operations.
379 * @return void
380 * @todo Define visibility
381 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0) {
	// Store both free-index identifiers in the indexer configuration.
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId
	);
	foreach ($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
386
387 /**
388 * Indexing records as the content of a TYPO3 page.
389 *
390 * @param string Title equivalent
391 * @param string Keywords equivalent
392 * @param string Description equivalent
393 * @param string The main content to index
394 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
395 * @param integer Last modification time, in seconds
396 * @param integer The creation date of the content, in seconds
397 * @param integer The record UID that the content comes from (for registration with the indexed rows)
398 * @return void
399 * @todo Define visibility
400 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0) {
	// Wrap the record data in a minimal fake HTML document so the regular
	// page indexing code can split it into title, keywords, description and body.
	$fakeHtml = '
<html>
<head>
<title>' . htmlspecialchars($title) . '</title>
<meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
<meta name="description" content="' . htmlspecialchars($description) . '" />
</head>
<body>
' . htmlspecialchars($content) . '
</body>
</html>';
	// Modification time, creation time and originating record uid
	$this->conf['mtime'] = $mtime;
	$this->conf['crdate'] = $crdate;
	$this->conf['recordUid'] = $recordUid;
	$this->conf['content'] = $fakeHtml;
	// Charset of the supplied strings; no alternative document title
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';
	// Index the fake document as if it was a TYPO3 page
	$this->indexTypo3PageContent();
}
430
431 /********************************
432 *
433 * Initialization
434 *
435 *******************************/
436 /**
437 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
438 *
439 * @return void
440 * @todo Define visibility
441 */
public function init() {
	global $TYPO3_CONF_VARS;
	// cHash parameters: add the cHash itself (so generated URLs come out right)
	// and never keep the encryption key among them.
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}
	// Set phash / phash_grouping, which identify the indexed page
	$this->setT3Hashes();
	// Extension Manager settings; minAge/maxAge are multiplied by 3600 before being stored
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['minAge'] * 3600, 0);
	$this->tstamp_maxAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['maxAge'] * 3600, 0);
	$this->maxExternalFiles = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
	$this->flagBitMask = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);
	// A missing setting (extension configuration not updated yet) counts as enabled
	$this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || (bool)$this->indexerConfig['enableMetaphoneSearch'];
	// Metaphone info is only stored as words when the index_words table is not in use
	$this->storeMetaphoneInfoAsWords = !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
	// External document parsers are only needed when external files get indexed
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}
	// Lexer (splits text into words): configured class reference or the default lexer
	$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
	$this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];
	// Metaphone hook object, only if metaphone search is enabled and a hook is registered
	if ($this->enableMetaphoneSearch && $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = GeneralUtility::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}
	// Charset conversion object
	$this->csObj = GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
}
482
483 /**
484 * Initialize external parsers
485 *
486 * @return void
487 * @access private
488 * @see init()
489 * @todo Define visibility
490 */
public function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;
	$registeredParsers = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'];
	if (!is_array($registeredParsers)) {
		return;
	}
	foreach ($registeredParsers as $extension => $_objRef) {
		$parser = GeneralUtility::getUserObj($_objRef);
		// Register the parser before initialising it, then give it a back reference to this indexer
		$this->external_parsers[$extension] = $parser;
		$parser->pObj = $this;
		// A parser whose initParser() returns a false value is dropped again
		if (!$parser->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
504
505 /********************************
506 *
507 * Indexing; TYPO3 pages (HTML content)
508 *
509 *******************************/
510 /**
511 * Start indexing of the TYPO3 page
512 *
513 * @return void
514 * @todo Define visibility
515 */
public function indexTypo3PageContent() {
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);
	// Indexing is needed when the mtime check says so, when the gr_list is not
	// registered yet, or when indexing is forced.
	if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		return;
	}
	// Log why indexing takes place
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced', 1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', 1);
	}
	// Divide the HTML into title, keywords, description and body
	$this->log_push('Split content', '');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();
	// Hash over all content parts (title, keywords, description, body)
	$this->content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
	// An array result means checkContentHash() found a row for this content hash.
	// Unless the mtime check returned 1 (max-age exceeded), only the bookkeeping
	// data is refreshed in that case.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check !== 1) {
		$this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
		$this->updateSetId($this->hash['phash']);
		// $checkCHash['phash'] is the phash of the row with the same content hash
		$this->update_grlist($checkCHash['phash'], $this->hash['phash']);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}
	// Full (re-)indexing; the elapsed time is stored as parsetime below
	$Pstart = DateTimeUtility::milliseconds();
	$this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
	$this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
	$this->log_pull();
	$this->log_push('Extract words from content', '');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();
	$this->log_push('Analyse the extracted words', '');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();
	$this->log_push('Submitting page', '');
	$this->submitPage();
	$this->log_pull();
	// Words are only checked and submitted when the index_words table is in use
	$this->log_push('Check word list and submit words', '');
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $this->hash['phash']);
	}
	$this->log_pull();
	$this->updateParsetime($this->hash['phash'], DateTimeUtility::milliseconds() - $Pstart);
	// Linked external files are handled only if configured
	$this->log_push('Checking external files', '');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
586
587 /**
588 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
589 *
590 * @param string HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
591 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
592 * @see splitRegularContent()
593 * @todo Define visibility
594 */
public function splitHTMLContent($content) {
	// Splits an HTML document into the parts "title", "keywords", "description" and "body".
	// Input: HTML string. Output: array with those four keys (see $this->defaultContentArray).
	$contentArr = $this->defaultContentArray;
	// Everything from the first "<body" onwards is body, everything before it is head.
	$body = stristr($content, '<body');
	if ($body === FALSE) {
		// No body tag at all. The former "substr($content, 0, -strlen($body))" then became
		// "substr($content, 0, 0)", which silently discarded the head and with it the title
		// and meta tags. Treat the whole document as head instead; body stays empty as before.
		$body = '';
	}
	$contentArr['body'] = $body;
	$headPart = (string)substr($content, 0, strlen($content) - strlen($body));
	// Title: if it contains a colon, only the part after the first colon is used.
	$this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
	$titleParts = explode(':', $contentArr['title'], 2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
	// Keywords and description meta tags
	if ($this->conf['index_metatags']) {
		// Each successful call hands back the attribute string of the next meta tag and
		// shortens $headPart to what follows it, so the loop ends when no meta tag is left.
		while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $metaTagParams)) {
			$metaAttributes = GeneralUtility::get_tag_attributes($metaTagParams);
			// Meta tags without a name attribute (e.g. http-equiv) carry nothing to index here.
			$metaName = isset($metaAttributes['name']) ? $metaAttributes['name'] : '';
			$metaContent = isset($metaAttributes['content']) ? $metaAttributes['content'] : '';
			if (stristr($metaName, 'keywords')) {
				$contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
			}
			if (stristr($metaName, 'description')) {
				$contentArr['description'] .= ',' . $metaContent;
			}
		}
	}
	// Process <!--TYPO3SEARCH_begin--> and <!--TYPO3SEARCH_end--> markers
	$this->typoSearchTags($contentArr['body']);
	// Remove unwanted sections (see $this->excludeSections) from the body
	$tagList = explode(',', $this->excludeSections);
	foreach ($tagList as $tag) {
		while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
			// Intentionally empty: every pass removes one section.
		}
	}
	// Put a space before every tag so that stripping tags does not glue words together
	$contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));
	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);
	return $contentArr;
}
639
640 /**
641 * Extract the charset value from HTML meta tag.
642 *
643 * @param string HTML content
644 * @return string The charset value if found.
645 * @todo Define visibility
646 */
public function getHTMLcharset($content) {
	// Returns the charset name declared in a meta tag of $content, or NULL if none is found.
	// Charset names may contain underscores (e.g. "Shift_JIS"), so "_" is part of the
	// accepted characters; without it such names were cut off at the underscore.
	//
	// 1) Classic declaration: <meta http-equiv="Content-Type" content="text/html; charset=...">
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)
		&& preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]_-]+)/i', $reg[0], $reg2)
	) {
		return $reg2[1];
	}
	// 2) HTML5 declaration: <meta charset="...">. Quoted attribute values in front of the
	// charset attribute are skipped as a whole, so "charset=" inside another attribute's
	// value (e.g. in a description) is not mistaken for the declaration.
	if (preg_match('/<meta[[:space:]]+(?:[^>"\']|"[^"]*"|\'[^\']*\')*?(?<=[[:space:]])charset[[:space:]]*=[[:space:]]*["\']?([[:alnum:]_-]+)/i', $content, $reg)) {
		return $reg[1];
	}
	return NULL;
}
654
655 /**
656 * Converts a HTML document to utf-8
657 *
658 * @param string HTML content, any charset
659 * @param string Optional charset (otherwise extracted from HTML)
660 * @return string Converted HTML
661 * @todo Define visibility
662 */
public function convertHTMLToUtf8($content, $charset = '') {
	// Without an explicit charset, fall back to the one declared in the document
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$sourceCharset = $this->csObj->parse_charset($charset);
	// Only convert when a charset is known and it is not utf-8 already
	if ($sourceCharset && $sourceCharset !== 'utf-8') {
		$content = $this->csObj->utf8_encode($content, $sourceCharset);
	}
	// Entities are converted on the assumption that the document is UTF-8 by now
	return $this->csObj->entities_to_utf8($content, TRUE);
}
675
676 /**
677 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
678 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
679 * <title> of document or removing <script>-sections
680 *
681 * @param string String to search in
682 * @param string Tag name, e.g. "script"
683 * @param string Passed by reference: Content inside found tag
684 * @param string Passed by reference: Content after found tag
685 * @param string Passed by reference: Attributes of the found tag.
686 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
687 * @todo Define visibility
688 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
	$openingTag = '<' . $tagName;
	$closingTag = '</' . $tagName . '>';
	// Case-insensitive search; yields $string from the opening tag onwards
	$fromOpeningTag = stristr($string, $openingTag);
	if (!$fromOpeningTag) {
		return FALSE;
	}
	// Text up to the first ">" is the attribute list, the remainder follows the opening tag
	list($paramList, $afterOpeningTag) = explode('>', substr($fromOpeningTag, strlen($openingTag)), 2);
	$fromClosingTag = stristr($afterOpeningTag, $closingTag);
	if (!$fromClosingTag) {
		// No closing tag: no embraced content, and only the text after the opening tag is returned
		$tagContent = '';
		$stringAfter = $afterOpeningTag;
		return TRUE;
	}
	// Closing tag found: return the embraced content and the input with the whole element cut out
	$stringBefore = substr($string, 0, strlen($string) - strlen($fromOpeningTag));
	$tagContent = substr($afterOpeningTag, 0, strlen($afterOpeningTag) - strlen($fromClosingTag));
	$stringAfter = $stringBefore . substr($fromClosingTag, strlen($closingTag));
	return TRUE;
}
710
711 /**
712 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
713 *
714 * @param string HTML Content, passed by reference
715 * @return boolean Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
716 * @todo Define visibility
717 */
public function typoSearchTags(&$body) {
	// Split at every "<!--TYPO3SEARCH_" marker start; each later segment begins with the marker name
	$segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
	if (count($segments) <= 1) {
		return FALSE;
	}
	$body = '';
	foreach ($segments as $segment) {
		$markerAndRest = explode('-->', $segment, 2);
		switch (trim($markerAndRest[0])) {
			case 'begin':
				// Text after a "begin" marker is kept
				$body .= $markerAndRest[1];
				$prev = '';
				break;
			case 'end':
				// An "end" marker keeps the remembered segment; the text after the marker is dropped
				$body .= $prev;
				break;
			default:
				// Not a begin/end segment: remember it for a following "end" marker
				$prev = $segment;
		}
	}
	return TRUE;
}
738
739 /**
740 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
741 *
742 * @param string HTML content
743 * @return void
744 * @todo Define visibility
745 */
public function extractLinks($content) {
	// Walks over all hyperlinks found in $content and indexes what they point to:
	// external URLs (if enabled in the indexer configuration) and existing local files.
	// Local files are either indexed right away or handed to the "crawler" queue.
	$list = $this->extractHyperLinks($content);
	// Stays NULL unless external files are to be processed through the crawler extension.
	// (Previously the variable was left undefined in that case and read further down.)
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = GeneralUtility::makeInstance('tx_crawler_lib');
	}
	foreach ($list as $linkInfo) {
		// localPath means: the file is sent by a download script. The indexed URL has to
		// point to $linkInfo['href'], while the file itself is read from localPath.
		$hasLocalPath = !empty($linkInfo['localPath']);
		$linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
		// parse_url() leaves out absent components and returns FALSE for malformed URLs,
		// therefore every component is tested with empty() instead of being read directly.
		$qParts = parse_url($linkSource);
		// jumpurl (TYPO3 specific): the real target is carried in the query string
		if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
			$getP = array();
			parse_str($qParts['query'], $getP);
			$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);
		}
		if (!$hasLocalPath && !empty($qParts['scheme'])) {
			// External URL (http or otherwise)
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}
		// Links with a query string are not treated as files
		if (!empty($qParts['query'])) {
			continue;
		}
		$linkSource = urldecode($linkSource);
		if (GeneralUtility::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}
		if (is_object($crawler)) {
			// Queue the file for the crawler; the page content is not passed along
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);
			$crawler->addQueueEntry_callBack(0, $params, '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerFilesHook', $this->conf['id']);
			$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
		} elseif ($hasLocalPath) {
			// Indexed under the href, read from the local path; the extension comes from the local path
			$ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
816
/**
 * Collects every <a> tag of the given HTML that carries a usable href.
 *
 * Anchors without an href and anchors whose href starts with "#" are left out.
 *
 * @param string $html HTML content to scan
 * @return array List of arrays with the keys "tag" (the tag chunk), "href" (raw href value) and "localPath" (result of createLocalPath(), empty if not local)
 * @see extractLinks()
 * @todo Define visibility
 */
public function extractHyperLinks($html) {
	$parser = GeneralUtility::makeInstance('TYPO3\CMS\Core\Html\HtmlParser');
	$links = array();
	foreach ($parser->splitTags('a', $html) as $position => $chunk) {
		// Only odd positions are inspected (presumably the tag parts of splitTags() - confirm against HtmlParser)
		if ($position % 2 === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($chunk, TRUE);
		$tagName = $parser->getFirstTagName($chunk);
		if (strtolower($tagName) != 'a') {
			continue;
		}
		$href = $attributes[0]['href'];
		// Skip anchors without target and in-page fragment links
		if ($href && $href[0] != '#') {
			$links[] = array(
				'tag' => $chunk,
				'href' => $href,
				'localPath' => $this->createLocalPath($href)
			);
		}
	}
	return $links;
}
846
/**
 * Finds the href of the first <base> tag that has a non-empty href.
 *
 * @param string $html Content to analyze
 * @return string The base href, or an empty string if no <base> tag was found
 */
public function extractBaseHref($html) {
	$parser = GeneralUtility::makeInstance('TYPO3\CMS\Core\Html\HtmlParser');
	$baseHref = '';
	foreach ($parser->splitTags('base', $html) as $position => $chunk) {
		// Only odd positions are inspected (presumably the tag parts of splitTags() - confirm against HtmlParser)
		if ($position % 2 === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($chunk, TRUE);
		$tagName = $parser->getFirstTagName($chunk);
		if (strtolower($tagName) == 'base') {
			$baseHref = $attributes[0]['href'];
			// The first usable href wins
			if ($baseHref) {
				return $baseHref;
			}
		}
	}
	return $baseHref;
}
871
872 /******************************************
873 *
874 * Indexing; external URL
875 *
876 ******************************************/
/**
 * Indexes the HTML content of an external URL.
 *
 * The response headers are requested first; the document is only fetched and
 * indexed when the Content-Type header contains "text/html". The fetched
 * content is kept in $this->indexExternalUrl_content, written to a temporary
 * file, handed to indexRegularDocument() with forced indexing and the
 * temporary file is removed afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 * @todo Define visibility
 */
public function indexExternalUrl($externalUrl) {
	// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
		return;
	}
	// Prefer the canonical spelling, but fall back to a case-insensitive
	// match because HTTP header field names are case-insensitive.
	$contentType = isset($urlHeaders['Content-Type']) ? (string)$urlHeaders['Content-Type'] : '';
	if ($contentType === '') {
		foreach ($urlHeaders as $headerName => $headerValue) {
			if (strtolower(trim($headerName)) === 'content-type') {
				$contentType = (string)$headerValue;
				break;
			}
		}
	}
	if (stripos($contentType, 'text/html') === FALSE) {
		return;
	}
	$content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
	if (!strlen($content)) {
		return;
	}
	// Create temporary file:
	$tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
	if ($tmpFile) {
		GeneralUtility::writeFile($tmpFile, $content);
		// Index that file. "TRUE" as second parameter forces indexing,
		// since a modification time check makes no sense for external URLs.
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
		unlink($tmpFile);
	}
}
907
/**
 * Gets the HTTP response headers of a URL.
 *
 * Each response line is split at the first colon; the part before it becomes
 * the array key, the (untrimmed) remainder the value. Lines without a colon
 * (such as the status line) are stored with a NULL value.
 *
 * @param string $url The URL
 * @return mixed FALSE if there was no answer, otherwise an array with the header names as keys
 * @todo Define visibility
 */
public function getUrlHeaders($url) {
	// Try to get the headers only
	$content = GeneralUtility::getUrl($url, 2);
	if (!strlen($content)) {
		return FALSE;
	}
	// Compile headers:
	$retVal = array();
	foreach (GeneralUtility::trimExplode(LF, $content, TRUE) as $line) {
		if (!strlen(trim($line))) {
			break;
		}
		$lineParts = explode(':', $line, 2);
		// A line without a colon has no value part
		$retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
	}
	return $retVal;
}
933
/**
 * Tries to translate a link target into a local file path.
 *
 * A fixed list of resolver methods is tried in order; the first non-empty
 * result is returned.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
	// Order matters: the first resolver that yields a path wins
	$resolverMethods = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);
	$resolvedPath = '';
	foreach ($resolverMethods as $resolverMethod) {
		$resolvedPath = $this->{$resolverMethod}($sourcePath);
		if ($resolvedPath != '') {
			return $resolvedPath;
		}
	}
	return $resolvedPath;
}
957
/**
 * Looks up a local file path in the T3_VAR registry
 * ($GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']), keyed by
 * the short MD5 of the source path. This is useful for various download
 * extensions that hide the actual file name but still want the file indexed.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Registered file path if it exists as a file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}
	$lookupKey = GeneralUtility::shortMD5($sourcePath);
	// Note: self::isAllowedLocalFile is deliberately not used here because
	// this method is allowed to index files outside of the web site
	// (for example, protected downloads)
	$isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);
	return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
980
/**
 * Builds a local file path for links that start with the site URL of the
 * current request (TYPO3_SITE_URL).
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Path below PATH_site if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	$siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);
	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}
	// Strip the site URL and map the remainder onto the document root
	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1000
/**
 * Builds a local file path for links that start with the configured
 * absRefPrefix. This requires TSFE; without it (or without a non-empty
 * absRefPrefix) an empty string is returned.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Path below PATH_site if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	if (!($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController)) {
		return '';
	}
	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);
	if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}
	// Strip the prefix and map the remainder onto the document root
	$candidate = PATH_site . substr($sourcePath, $prefixLength);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1023
/**
 * Builds a local file path from an absolute URL without schema, i.e. a
 * link target starting with "/".
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Path below PATH_site if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	if ($sourcePath[0] != '/') {
		return '';
	}
	// Drop the leading slash before appending to the document root
	$candidate = PATH_site . substr($sourcePath, 1);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1042
/**
 * Builds a local file path from a relative URL by appending it to PATH_site.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Path below PATH_site if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}
	$candidate = PATH_site . $sourcePath;
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1059
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not
 * start with "/".
 *
 * parse_url() leaves out components that are not present and may return
 * FALSE for seriously malformed URLs, so every component is checked for
 * existence before it is read. A URL that cannot be parsed is treated like
 * one without scheme and path (and is therefore reported as relative).
 *
 * @param string $url
 * @return boolean
 */
static protected function isRelativeURL($url) {
	$urlParts = @parse_url($url);
	if (!is_array($urlParts)) {
		$urlParts = array();
	}
	$hasScheme = isset($urlParts['scheme']) && $urlParts['scheme'] != '';
	$path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
	$startsAtRoot = $path !== '' && $path[0] == '/';
	return !$hasScheme && !$startsAtRoot;
}
1070
/**
 * Checks if the path points to an existing file inside the web site
 * (below PATH_site). Back paths are resolved before the check.
 *
 * @param string $filePath
 * @return boolean TRUE if the resolved path starts with PATH_site and is a file
 */
static protected function isAllowedLocalFile($filePath) {
	$resolvedPath = GeneralUtility::resolveBackPath($filePath);
	$existsAsFile = is_file($resolvedPath);
	$startsWithSiteRoot = substr($resolvedPath, 0, strlen(PATH_site)) == PATH_site;
	return $startsWithSiteRoot && $existsAsFile;
}
1083
1084 /******************************************
1085 *
1086 * Indexing; external files (PDF, DOC, etc)
1087 *
1088 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is split into the parts reported by fileContentParts(); each
 * part is indexed when checkMtimeTstamp() reports that indexing is needed or
 * when $force is set. A section record is written for every part, also when
 * the part itself was not (re-)indexed.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param boolean $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 * @todo Define visibility
 */
public function indexRegularDocument($file, $force = FALSE, $contentTmpFile = '', $altExtension = '') {
	// Init
	$fI = pathinfo($file);
	// pathinfo() omits the "extension" key for names without a dot
	$ext = $altExtension ?: (isset($fI['extension']) ? strtolower($fI['extension']) : '');
	// Create abs-path:
	if (!$contentTmpFile) {
		if (!GeneralUtility::isAbsPath($file)) {
			// Relative, prepend PATH_site:
			$absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
		} else {
			// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
		return;
	}
	// Indexing the document:
	$fileInfo = stat($absFile);
	$cParts = $this->fileContentParts($ext, $absFile);
	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
		$Pstart = DateTimeUtility::milliseconds();
		$subinfo = array('key' => $cPKey);
		// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
		$check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag', 1);
			}
			// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
				// Divide into title,keywords,description and body:
				$this->log_push('Split content', '');
				$contentParts = $this->readFileContent($ext, $absFile, $cPKey);
				$this->log_pull();
				if (is_array($contentParts)) {
					// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
					// Separator first: the reversed argument order is deprecated in PHP 7.4 and removed in PHP 8.0
					$content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $contentParts));
					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
						// Increment counter:
						$this->externalFileCounter++;
						// Splitting words
						$this->log_push('Extract words from content', '');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();
						// Analyse the indexed words.
						$this->log_push('Analyse the extracted words', '');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();
						// Submitting page (phash) record
						$this->log_push('Submitting page', '');
						// The creation time of a file cannot be determined reliably, so ctime from stat() is passed on
						$this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
						$this->log_pull();
						// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words', '');
						if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
							$this->checkWordList($indexArr);
							$this->submitWords($indexArr, $phash_arr['phash']);
						}
						$this->log_pull();
						// Set parsetime
						$this->updateParsetime($phash_arr['phash'], DateTimeUtility::milliseconds() - $Pstart);
					} else {
						// Update the timestamp
						$this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		}
		// Checking and setting sections:
		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
1195
/**
 * Reads the content of an external file being indexed by delegating to the
 * external parser object registered for the file extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array Standard content array (title, description, keywords, body keys), or NULL if no parser object is registered for the extension
 * @todo Define visibility
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
	// Without a parser object there is nothing to consult
	if (!is_object($this->external_parsers[$fileExtension])) {
		return NULL;
	}
	return $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1214
/**
 * Creates an array with pointers to divisions of document by asking the
 * external parser object registered for the extension.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; array(0) if no parser object is registered
 * @todo Define visibility
 */
public function fileContentParts($ext, $absFile) {
	$parser = $this->external_parsers[$ext];
	// Fall back to a single, undivided part when no parser object is available
	return is_object($parser) ? $parser->fileContentParts($ext, $absFile) : array(0);
}
1231
/**
 * Wraps non-HTML content (from external files for instance) into a content
 * array: a copy of $this->defaultContentArray with "body" set to the content.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Content array with the key "body" set to $content; all other keys come from $this->defaultContentArray
 * @see splitHTMLContent()
 * @todo Define visibility
 */
public function splitRegularContent($content) {
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;
	return $parts;
}
1245
1246 /**********************************
1247 *
1248 * Analysing content, Extracting words
1249 *
1250 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * charset already is "utf-8") and decodes numeric / HTML entities in it.
 * The array is modified in place.
 *
 * @param array $contentArr Standard content array, passed by reference
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 * @todo Define visibility
 */
public function charsetEntity2utf8(&$contentArr, $charset) {
	$needsConversion = $charset !== 'utf-8';
	foreach (array_keys($contentArr) as $field) {
		// Empty values need neither conversion nor entity decoding
		if (!strlen($contentArr[$field])) {
			continue;
		}
		if ($needsConversion) {
			$contentArr[$field] = $this->csObj->utf8_encode($contentArr[$field], $charset);
		}
		// decode all numeric / html-entities in the string to real characters:
		$contentArr[$field] = $this->csObj->entities_to_utf8($contentArr[$field], TRUE);
	}
}
1271
/**
 * Splits every entry of the content array into words using the lexer and
 * removes duplicate words from title, keywords and description.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words (unique for title, keywords and description)
 * @todo Define visibility
 */
public function processWordsInArrays($contentArr) {
	// split all parts to words
	foreach (array_keys($contentArr) as $field) {
		$contentArr[$field] = $this->lexerObj->split2Words($contentArr[$field]);
	}
	// For title, keywords, and description we don't want duplicates:
	foreach (array('title', 'keywords', 'description') as $headerField) {
		$contentArr[$headerField] = array_unique($contentArr[$headerField]);
	}
	return $contentArr;
}
1291
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length is taken from $this->conf['index_descrLgd'], forced
 * into the range 0-255 with a default of 200. A length of 0 disables the
 * description and yields an empty string.
 *
 * @param array $contentArr Content array (the "body" key is used)
 * @return string Description string
 * @todo Define visibility
 */
public function bodyDescription($contentArr) {
	// Defined up front so a disabled description returns a string instead of an undefined variable
	$bodyDescription = '';
	// Setting description
	$maxL = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
	if ($maxL) {
		// NOTE(review): the first search string is reproduced as it appears in this view (a single blank);
		// confirm it against the checked-in file, it may be a non-breaking space.
		$bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);
		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}
	return $bodyDescription;
}
1309
/**
 * Analyzes content to use for indexing.
 *
 * Title, keywords and description are processed by analyzeHeaderinfo() with
 * the bit offsets 7, 6 and 5, followed by the body via analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array keyed by word; the entries are filled by analyzeHeaderinfo() and analyzeBody() ("hash", "metaphone", "count", and "cmp" / "first" where applicable)
 * @todo Define visibility
 */
public function indexAnalyze($content) {
	$indexArr = array();
	$this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
	$this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
	$this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
	$this->analyzeBody($indexArr, $content);
	return $indexArr;
}
1326
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the index entry gets its "hash" and
 * "metaphone" (on first occurrence), the bit 2^$offset is set in "cmp" and
 * "count" is increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param integer $offset Bit-wise priority to type
 * @return void
 * @todo Define visibility
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
	// Nothing to analyze if the requested part is missing or not a word list
	if (!isset($content[$key]) || !is_array($content[$key])) {
		return;
	}
	foreach ($content[$key] as $val) {
		// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);
		if (!isset($retArr[$val])) {
			// Word ID (wid)
			$retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
			// Metaphone value is also 60 only chars long
			$metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
			$retArr[$val]['metaphone'] = $metaphone;
		}
		// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}
		// Priority used for flagBitMask feature (see extension configuration).
		// "cmp" does not exist yet on the first occurrence of a word.
		$currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
		// Increase number of occurences ("count" starts at zero for a new word)
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1359
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] that is not yet in the index array the
 * position of its first occurrence ("first"), its "hash" and "metaphone" are
 * stored; "count" is increased for every occurrence, as is $this->wordcount.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 * @todo Define visibility
 */
public function analyzeBody(&$retArr, $content) {
	// Nothing to analyze if there is no body word list
	if (!isset($content['body']) || !is_array($content['body'])) {
		return;
	}
	foreach ($content['body'] as $key => $val) {
		// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);
		if (!isset($retArr[$val])) {
			// First occurence (used for ranking results)
			$retArr[$val]['first'] = $key;
			// Word ID (wid)
			$retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
			// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
			$retArr[$val]['metaphone'] = $metaphone;
		}
		// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}
		// Increase number of occurences ("count" starts at zero for a new word)
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1390
/**
 * Creating metaphone based hash from input word
 *
 * Uses $this->metaphoneObj if it is an object, otherwise PHP's native
 * metaphone() function.
 *
 * @param string $word Word to convert
 * @param boolean $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 * @todo Define visibility
 */
public function metaphone($word, $returnRawMetaphoneValue = FALSE) {
	// Use the native PHP function unless an advanced metaphone object is available
	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);
	if ($returnRawMetaphoneValue) {
		return $rawValue;
	}
	if (strlen($rawValue)) {
		// Create hash and return integer
		return \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($rawValue);
	}
	return 0;
}
1416
1417 /********************************
1418 *
1419 * SQL; TYPO3 Pages
1420 *
1421 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Existing records for the page hash are removed first. Then the index_phash
 * row, the section and gr_list records, the index_fulltext row and - in
 * debug mode - the index_debug row are written, each only if the respective
 * table is in use.
 *
 * @return void
 * @todo Define visibility
 */
public function submitPage() {
	// Remove any current data for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);
	// setting new phash_row
	$phashRow = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		// item_type 0: TYPO3 page
		'item_type' => 0,
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		// Creation date of page
		'item_crdate' => $this->conf['crdate'],
		// Sys language uid of the page. Should reflect which language it DOES actually display!
		'sys_language_uid' => $this->conf['sys_language_uid'],
		'externalUrl' => 0,
		'recordUid' => (int)$this->conf['recordUid'],
		'freeIndexUid' => (int)$this->conf['freeIndexUid'],
		'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
	}
	// PROCESSING index_section
	$this->submit_section($this->hash['phash'], $this->hash['phash']);
	// PROCESSING index_grlist
	$this->submit_grlist($this->hash['phash'], $this->hash['phash']);
	// PROCESSING index_fulltext
	$fulltextRow = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => implode(' ', $this->contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	// Optionally limit the stored full text to the configured length
	if ($this->indexerConfig['fullTextDataLength'] > 0) {
		$fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
	}
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
	}
	// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	// Content and body are truncated to 1000 bytes for the debug record
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
		'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString
	);
	$debugRow = array(
		'phash' => $this->hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1496
/**
 * Stores gr_list in the database (table index_grlist, if in use).
 *
 * @param integer $hash Search result record phash
 * @param integer $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 * @todo Define visibility
 */
public function submit_grlist($hash, $phash_x) {
	// Setting the gr_list record
	$grListRow = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']),
		'gr_list' => $this->conf['gr_list']
	);
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grListRow);
}
1518
/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is completed by getRootLineFields() before it is inserted into
 * index_section (if that table is in use).
 *
 * @param integer $hash phash of TYPO3 parent search result record
 * @param integer $hash_t3 phash of the file indexation search record
 * @return void
 * @todo Define visibility
 */
public function submit_section($hash, $hash_t3) {
	$sectionRow = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => (int)$this->conf['id']
	);
	$this->getRootLineFields($sectionRow);
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1539
/**
 * Removes records for the indexed page, $phash
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedPages($phash) {
	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
			$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . (int)$phash);
		}
	}
	// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . (int)$phash);
	}
}
1560
1561 /********************************
1562 *
1563 * SQL; External media
1564 *
1565 *******************************/
/**
 * Updates db with information about the file
 *
 * Existing records for the file hash are removed first. Then the index_phash
 * row, the index_fulltext row and - in debug mode - the index_debug row are
 * written, each only if the respective table is in use.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param integer $mtime Modification time of file.
 * @param integer $ctime Creation time of file.
 * @param integer $size Size of file in bytes
 * @param integer $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 * @todo Define visibility
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {
	// Find item Type: the parser's mapping wins, the extension itself is the fallback
	$mappedItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	$itemType = $mappedItemType ?: $ext;
	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);
	// Split filename:
	$urlParts = parse_url($file);
	// Setting new
	$phashRow = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $itemType,
		'item_title' => trim($contentParts['title']) ?: basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		// A file name that carries a scheme is flagged as external URL
		'externalUrl' => $urlParts['scheme'] ? 1 : 0,
		'recordUid' => (int)$this->conf['recordUid'],
		'freeIndexUid' => (int)$this->conf['freeIndexUid'],
		'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
		'sys_language_uid' => (int)$this->conf['sys_language_uid']
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
	}
	// PROCESSING index_fulltext
	$fulltextRow = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	// Optionally limit the stored full text to the configured length
	if ($this->indexerConfig['fullTextDataLength'] > 0) {
		$fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
	}
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
	}
	// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	// The body is truncated to 1000 bytes for the debug record
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString
	);
	$debugRow = array(
		'phash' => $hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1642
1643 /**
1644 * Stores file gr_list for a file IF it does not exist already
1645 *
1646 * @param integer phash value of file
1647 * @return void
1648 * @todo Define visibility
1649 */
public function submitFile_grlist($hash) {
	// Nothing to store when the index_grlist table is not in use.
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	// A record for either the default group list or the currently configured
	// group list makes a further record for this phash unnecessary.
	$defaultListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList);
	$currentListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
	$where = 'phash=' . (int)$hash . ' AND (hash_gr_list=' . $defaultListHash . ' OR hash_gr_list=' . $currentListHash . ')';
	$existingRecords = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existingRecords == 0) {
		// For files the phash and the "real" phash are the same value.
		$this->submit_grlist($hash, $hash);
	}
}
1659
1660 /**
1661 * Stores file section for a file IF it does not exist
1662 *
1663 * @param integer phash value of file
1664 * @return void
1665 * @todo Define visibility
1666 */
public function submitFile_section($hash) {
	// Nothing to store when the index_section table is not in use.
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	// Look for a section record linking this file to the page being indexed.
	$where = 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id'];
	$existingSections = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
	if ($existingSections == 0) {
		$this->submit_section($hash, $this->hash['phash']);
	}
}
1676
1677 /**
1678 * Removes records for the indexed page, $phash
1679 *
1680 * @param integer phash value to flush
1681 * @return void
1682 * @todo Define visibility
1683 */
public function removeOldIndexedFiles($phash) {
	// The same condition is applied to every table that stores per-phash data for files.
	$phashCondition = 'phash=' . (int)$phash;
	$fileTables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($fileTables as $tableName) {
		if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
			// Tables that are not in use are left untouched.
			continue;
		}
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $phashCondition);
	}
}
1693
1694 /********************************
1695 *
1696 * SQL Helper functions
1697 *
1698 *******************************/
1699 /**
1700 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
1701 * Return positive integer if the page needs to be indexed
1702 *
1703 * @param integer mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
1704 * @param integer "phash" used to select any already indexed page to see what its mtime is.
1705 * @return integer Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) mtime matched so no need to reindex page. 0) N/A 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index.
1706 * @todo Define visibility
1707 */
public function checkMtimeTstamp($mtime, $phash) {
	// Result codes correspond to the keys of $this->reasons:
	// positive values mean "index", negative values mean "do not index".
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		// Not indexed (not in index_phash)
		return 4;
	}
	$indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
	if (!$indexedRow) {
		// Page has never been indexed (is not represented in the index_phash table).
		return 4;
	}
	$now = $GLOBALS['EXEC_TIME'];
	if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $now) {
		// The configured max-age was exceeded for the document and thus it's indexed.
		return 1;
	}
	if ($this->tstamp_minAge && $indexedRow['tstamp'] + $this->tstamp_minAge >= $now) {
		// A minimum age is configured and it was not exceeded.
		return -2;
	}
	// From here on: minAge is either not set or has been exceeded, so mtime decides.
	if (!$mtime) {
		// The minimum age was exceeded, but mtime was not set, so the page is indexed.
		return 3;
	}
	if ($indexedRow['item_mtime'] != $mtime) {
		// The minimum age was exceeded, mtime was set and it differs from the indexed mtime.
		return 2;
	}
	// mtime matched the document, so no changes detected and no content updated.
	if ($this->tstamp_maxAge) {
		// With a maxAge configured the timestamp is left alone so that the entry can still expire.
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $now) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
	}
	return -1;
}
1755
1756 /**
1757 * Check content hash in phash table
1758 *
1759 * @return mixed Returns TRUE if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
1760 * @todo Define visibility
1761 */
public function checkContentHash() {
	// Without the index_phash table there is nothing to compare against, so indexing is required.
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return TRUE;
	}
	// Look for an entry in the same "phash_grouping" that already holds identical content.
	$where = 'phash_grouping=' . (int)$this->hash['phash_grouping'] . ' AND contentHash=' . (int)$this->content_md5h;
	$matchingRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
	// A found row is handed back as-is; otherwise TRUE signals that the page must be indexed.
	return $matchingRow ?: TRUE;
}
1773
1774 /**
1775 * Check content hash for external documents
1776 * Returns TRUE if the document needs to be indexed (that is, there was no result)
1777 *
1778 * @param integer phash value to check (phash_grouping)
1779 * @param integer Content hash to check
1780 * @return boolean Returns TRUE if the document needs to be indexed (that is, there was no result)
1781 * @todo Define visibility
1782 */
public function checkExternalDocContentHash($hashGr, $content_md5h) {
	// Without the index_phash table nothing can match, so the document needs indexing.
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return TRUE;
	}
	$where = 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h;
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
	// Indexing is needed only when no entry with this grouping and content hash exists.
	return $matches == 0;
}
1791
1792 /**
1793 * Checks if a grlist record has been set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
1794 *
1795 * @param integer Phash integer to test.
1796 * @return boolean
1797 * @todo Define visibility
1798 */
public function is_grlist_set($phash_x) {
	// When the index_grlist table is not in use, no record can be set.
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return FALSE;
	}
	$records = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
	return $records > 0;
}
1807
1808 /**
1809 * Check if a grlist entry for this hash exists and, if not, write one.
1810 *
1811 * @param integer phash of the search result that should be found
1812 * @param integer The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
1813 * @return void
1814 * @see submit_grlist()
1815 * @todo Define visibility
1816 */
public function update_grlist($phash, $phash_x) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	// Check for a record combining this phash with the current group list.
	$groupListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
	$where = 'phash=' . (int)$phash . ' AND hash_gr_list=' . $groupListHash;
	$existingRecords = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existingRecords == 0) {
		// No such record yet: write one and note it in the log.
		$this->submit_grlist($phash, $phash_x);
		$this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
	}
}
1826
1827 /**
1828 * Update tstamp for a phash row.
1829 *
1830 * @param integer phash value
1831 * @param integer If set, update the mtime field to this value.
1832 * @return void
1833 * @todo Define visibility
1834 */
public function updateTstamp($phash, $mtime = 0) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	// The timestamp is always refreshed to the current execution time.
	$values = array(
		'tstamp' => $GLOBALS['EXEC_TIME']
	);
	// item_mtime is only touched when a non-empty mtime was passed in.
	if ($mtime) {
		$values['item_mtime'] = (int)$mtime;
	}
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $values);
}
1846
1847 /**
1848 * Update SetID of the index_phash record.
1849 *
1850 * @param integer phash value
1851 * @return void
1852 * @todo Define visibility
1853 */
public function updateSetId($phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	// Write the configured free index set id into the phash record.
	$values = array('freeIndexSetId' => (int)$this->conf['freeIndexSetId']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $values);
}
1862
1863 /**
1864 * Update parsetime for phash row.
1865 *
1866 * @param integer phash value.
1867 * @param integer Parsetime value to set.
1868 * @return void
1869 * @todo Define visibility
1870 */
public function updateParsetime($phash, $parsetime) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	// Store the given parse time (as integer) on the phash record.
	$values = array('parsetime' => (int)$parsetime);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $values);
}
1879
1880 /**
1881 * Update section rootline for the page
1882 *
1883 * @return void
1884 * @todo Define visibility
1885 */
public function updateRootline() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	// getRootLineFields() fills the array by reference with the rl* values.
	$rootlineValues = array();
	$this->getRootLineFields($rootlineValues);
	// Apply them to every section record of the current page.
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int)$this->conf['id'], $rootlineValues);
}
1893
1894 /**
1895 * Adding values for root-line fields.
1896 * rl0, rl1 and rl2 are standard. A hook might add more.
1897 *
1898 * @param array Field array, passed by reference
1899 * @return void
1900 * @todo Define visibility
1901 */
public function getRootLineFields(array &$fieldArray) {
	// Standard fields: rl0, rl1 and rl2 map to rootline levels 0, 1 and 2.
	$standardLevels = array('rl0' => 0, 'rl1' => 1, 'rl2' => 2);
	foreach ($standardLevels as $standardField => $level) {
		$fieldArray[$standardField] = (int)$this->conf['rootline_uids'][$level];
	}
	// Additional field => level pairs may be registered through EXTCONF.
	$additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($additionalFields)) {
		return;
	}
	foreach ($additionalFields as $fieldName => $rootLineLevel) {
		$fieldArray[$fieldName] = (int)$this->conf['rootline_uids'][$rootLineLevel];
	}
}
1912
1913 /**
1914 * Removes any pages indexed under a user login which have the same contentHash
1915 * NOT USED anywhere inside this class!
1916 *
1917 * @return void
1918 * @todo Define visibility
1919 */
public function removeLoginpagesWithContentHash() {
	// Both tables are joined below, so both must be in use.
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash') || !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	// Select pages of the same grouping with identical content that were
	// indexed under a group list other than the default one.
	$where = 'A.phash=B.phash'
		. ' AND A.phash_grouping=' . (int)$this->hash['phash_grouping']
		. ' AND B.hash_gr_list<>' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList)
		. ' AND A.contentHash=' . (int)$this->content_md5h;
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', $where);
	if (!$res) {
		// The query failed: there is nothing to iterate and no result set to free.
		// (Previously sql_free_result() was still called with the failed result.)
		return;
	}
	while (FALSE !== ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		$this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
		$this->removeOldIndexedPages($row['phash']);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
}
1934
1935 /**
1936 * Includes the crawler class
1937 *
1938 * @return void
1939 * @todo Define visibility
1940 */
public function includeCrawlerClass() {
	// Resolve the library file inside the "crawler" extension, then load it once.
	$crawlerLibraryFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'class.tx_crawler_lib.php';
	GeneralUtility::requireOnce($crawlerLibraryFile);
}
1944
1945 /********************************
1946 *
1947 * SQL; Submitting words
1948 *
1949 *******************************/
1950 /**
1951 * Adds new words to db
1952 *
1953 * @param array $wordListArray Word List array (where each word has information about position etc).
1954 * @return void
1955 * @todo Define visibility
1956 */
public function checkWordList($wordListArray) {
	// Nothing to do when the words table is unused or the list is empty.
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') || !count($wordListArray)) {
		return;
	}
	// Collect the integer word ids of all words in the list.
	$wordIds = array_map(function ($wordData) {
		return (int)$wordData['hash'];
	}, $wordListArray);
	$widCondition = 'wid IN (' . implode(',', $wordIds) . ')';
	$knownWordCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $widCondition);
	if ($knownWordCount == count($wordListArray)) {
		// Every word is already present in index_words.
		return;
	}
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $widCondition);
	$this->log_setTSlogMessage('Inserting words: ' . (count($wordListArray) - $knownWordCount), 1);
	// Drop every word that already exists, leaving only the new ones in the list.
	while (FALSE != ($knownWord = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		unset($wordListArray[$knownWord['baseword']]);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
	foreach ($wordListArray as $baseword => $wordData) {
		// A duplicate-key error will occur here if a word was NOT removed by the unset() above.
		// However, as long as the words are not longer than 60 chars (the baseword varchar is 60 characters) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
			'wid' => $wordData['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordData['metaphone']
		));
	}
}
1986
1987 /**
1988 * Submits RELATIONS between words and phash
1989 *
1990 * @param array Word list array
1991 * @param integer phash value
1992 * @return void
1993 * @todo Define visibility
1994 */
public function submitWords($wordList, $phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_rel')) {
		return;
	}
	$phashValue = (int)$phash;
	// Existing relations for this phash are replaced completely.
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . $phashValue);
	foreach ($wordList as $word) {
		$relation = array(
			'phash' => $phashValue,
			'wid' => (int)$word['hash'],
			'count' => (int)$word['count'],
			'first' => (int)$word['first'],
			// Relative frequency of the word, mapped through freqMap().
			'freq' => $this->freqMap($word['count'] / $this->wordcount),
			// Only the bits covered by the flag bit mask are stored.
			'flags' => $word['cmp'] & $this->flagBitMask
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
2011
2012 /**
2013 * maps frequency from a real number in [0;1] to an integer in [0;$this->freqRange] with anything above $this->freqMax as 1
2014 * and back.
2015 *
2016 * @param double Frequency
2017 * @return integer Frequency in range.
2018 * @todo Define visibility
2019 */
public function freqMap($freq) {
	$scale = $this->freqMax * 100 * $this->freqRange;
	if ($freq < 1) {
		// Forward mapping: scale up and cap the result at freqRange.
		$scaled = $freq * $scale;
		if ($scaled > $this->freqRange) {
			return $this->freqRange;
		}
		return $scaled;
	}
	// Values of 1 and above are mapped back by dividing through the same factor.
	return $freq / $scale;
}
2030
2031 /********************************
2032 *
2033 * Hashing
2034 *
2035 *******************************/
2036 /**
2037 * Get search hash, T3 pages
2038 *
2039 * @return void
2040 * @todo Define visibility
2041 */
public function setT3Hashes() {
	// Values identifying a "page": id, type, language, mount point and cHash parameters.
	$pageIdentity = array(
		'id' => (int)$this->conf['id'],
		'type' => (int)$this->conf['type'],
		'sys_lang' => (int)$this->conf['sys_language_uid'],
		'MP' => (string) $this->conf['MP'],
		'cHash' => $this->cHashParams
	);
	// Grouping hash: computed before the group list is added.
	$serializedGrouping = serialize($pageIdentity);
	$this->hash['phash_grouping'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($serializedGrouping);
	// Plain phash: the same values plus the group list, so login-specific page variants get their own hash.
	$pageIdentity['gr_list'] = (string) $this->conf['gr_list'];
	$serializedPage = serialize($pageIdentity);
	$this->hash['phash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($serializedPage);
}
2057
2058 /**
2059 * Get search hash, external files
2060 *
2061 * @param string File name / path which identifies it on the server
2062 * @param array Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
2063 * @return array Array with "phash_grouping" and "phash" inside.
2064 * @todo Define visibility
2065 */
public function setExtHashes($file, $subinfo = array()) {
	// The grouping hash is derived from the file reference alone.
	$identity = array(
		'file' => $file
	);
	$groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
	// The plain phash additionally includes the sub-information.
	$identity['subinfo'] = $subinfo;
	$plainHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $plainHash
	);
}
2079
2080 /*********************************
2081 *
2082 * Internal logging functions
2083 *
2084 *********************************/
2085 /**
2086 * Push function wrapper for TT logging
2087 *
2088 * @param string Title to set
2089 * @param string Key (?)
2090 * @return void
2091 * @todo Define visibility
2092 */
public function log_push($msg, $key) {
	// Forward to the global TT object only when it is available as an object.
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker)) {
		return;
	}
	$timeTracker->push($msg, $key);
}
2098
2099 /**
2100 * Pull function wrapper for TT logging
2101 *
2102 * @return void
2103 * @todo Define visibility
2104 */
public function log_pull() {
	// Forward to the global TT object only when it is available as an object.
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker)) {
		return;
	}
	$timeTracker->pull();
}
2110
2111 /**
2112 * Set log message function wrapper for TT logging
2113 *
2114 * @param string Message to set
2115 * @param integer Error number
2116 * @return void
2117 * @todo Define visibility
2118 */
public function log_setTSlogMessage($msg, $errorNum = 0) {
	// Pass the message on to the global TT object when it is available.
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->setTSlogMessage($msg, $errorNum);
	}
	// The message is always recorded in the indexer's internal log as well.
	$this->internal_log[] = $msg;
}
2125
2126 /**************************
2127 *
2128 * \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController hooks:
2129 *
2130 **************************/
2131 /**
2132 * Makes sure that keywords are space-separated. This is important for their
2133 * proper displaying as a part of fulltext index.
2134 *
2135 * @param string $keywordList
2136 * @return string
2137 * @see http://forge.typo3.org/issues/14959
2138 */
protected function addSpacesToKeywordList($keywordList) {
	// Re-join the comma-separated keywords with ", " and pad the result with one space on each side.
	$spacedList = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
	return ' ' . $spacedList . ' ';
}
2143
2144 }