[BUGFIX] indexed_search/Indexer: correct frequency mapping for 1
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /**
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18
/**
 * Indexing class for TYPO3 frontend: the search indexer of the indexed_search extension
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 */
29 class Indexer {
30
// Messages:
/**
 * Explanations for the status codes that decide whether a document is (re-)indexed.
 * indexTypo3PageContent() looks the text up via $this->reasons[$check] for its log messages.
 *
 * @todo Define visibility
 */
public $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * HTML code blocks to exclude from indexing (comma list of tag names, see splitHTMLContent())
 *
 * @todo Define visibility
 */
public $excludeSections = 'script,style';

// Supported Extensions for external files:
/**
 * External parser objects, keys are file extension names. Values are objects with certain methods.
 * Filled by initializeExternalParsers().
 *
 * @todo Define visibility
 */
public $external_parsers = array();

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
 *
 * @todo Define visibility
 */
public $defaultGrList = '0,-1';

// Min/Max times:
/**
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 * init() derives it from the "maxAge" extension setting multiplied by 3600.
 *
 * @todo Define visibility
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
 * init() derives it from the "minAge" extension setting multiplied by 3600.
 *
 * @todo Define visibility
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @todo Define visibility
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @todo Define visibility
 */
public $forceIndexing = FALSE;

/**
 * Set when crawler is detected (internal)
 *
 * @todo Define visibility
 */
public $crawlerActive = FALSE;

// INTERNALS:
/**
 * Template for the array returned by splitHTMLContent(); every content part starts out empty.
 *
 * @todo Define visibility
 */
public $defaultContentArray = array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => ''
);

/**
 * @todo Define visibility
 */
public $wordcount = 0;

/**
 * @todo Define visibility
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @todo Define visibility
 */
public $conf = array();

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @todo Define visibility
 */
public $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping
 *
 * @todo Define visibility
 */
public $hash = array();

/**
 * Hash array for files
 *
 * @todo Define visibility
 */
public $file_phash_arr = array();

/**
 * Content of TYPO3 page (the parts returned by splitHTMLContent())
 *
 * @todo Define visibility
 */
public $contentParts = array();

/**
 * Hash over the imploded content parts, calculated in indexTypo3PageContent()
 *
 * @todo Define visibility
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @todo Define visibility
 */
public $internal_log = array();

/**
 * @todo Define visibility
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array (copied from $this->conf['cHash_array'] in init())
 *
 * @todo Define visibility
 */
public $cHashParams = array();

/**
 * @todo Define visibility
 */
public $freqRange = 32000;

/**
 * @todo Define visibility
 */
public $freqMax = 0.1;

/**
 * Set in init() from the "enableMetaphoneSearch" extension setting (TRUE if the setting does not exist yet)
 *
 * @todo Define visibility
 */
public $enableMetaphoneSearch = FALSE;

/**
 * Set in init(): TRUE only if metaphone search is enabled and the index_words table is not used
 *
 * @todo Define visibility
 */
public $storeMetaphoneInfoAsWords;

/**
 * @todo Define visibility
 */
public $metaphoneContent = '';

// Objects:
/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 * @todo Define visibility
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 * @todo Define visibility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 * @todo Define visibility
 */
public $lexerObj;

/**
 * Set in init() from the "flagBitMask" extension setting, forced into the range 0-255
 *
 * @todo Define visibility
 */
public $flagBitMask;
221
222 /**
223 * Parent Object (TSFE) Initialization
224 *
225 * @param object Parent Object (frontend TSFE object), passed by reference
226 * @return void
227 * @todo Define visibility
228 */
public function hook_indexContent(&$pObj) {
	// Decides whether the page just rendered by the frontend controller may be indexed.
	// If so, the relevant state is copied into $this->conf and indexing is started.
	// Settings of the extension as stored by the Extension Manager:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	// The crawler only takes over if it is loaded, a crawler session is running and
	// re-indexing was requested as a processing instruction:
	$crawlerForcesReindex = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($crawlerForcesReindex) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}
	// Nothing to do (and nothing to log) when indexing is not enabled for this page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}
	$this->log_push('Index page', '');
	if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Uids of all pages in the root line, keyed like the root line itself:
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $level => $rootlinePage) {
			$rootlineUids[$level] = $rootlinePage['uid'];
		}
		$this->conf = array(
			// Information about the page for which the indexing takes place:
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			// Mount point variable, if any:
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,
			// cHash string and the array of additional parameters it was built from:
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,
			'crdate' => $pObj->page['crdate'],
			'page_cache_reg1' => $pObj->page_cache_reg1,
			'rootline_uids' => $rootlineUids,
			// HTML of the page and the alternative title for indexing:
			'content' => $pObj->content,
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			// Charset of the content (converted to utf-8 during indexing):
			'metaCharset' => $pObj->metaCharset,
			// Most recent modification time, used to evaluate whether re-indexing is needed:
			'mtime' => isset($pObj->register['SYS_LASTCHANGED']) ? $pObj->register['SYS_LASTCHANGED'] : $pObj->page['SYS_LASTCHANGED'],
			// Configuration of behavior:
			'index_externals' => $pObj->config['config']['index_externals'],
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,
			// Not used for frontend indexing:
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0
		);
		$this->init();
		$this->indexTypo3PageContent();
	}
	$this->log_pull();
}
312
313 /****************************
314 *
315 * Backend API
316 *
317 ****************************/
318 /**
319 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
320 *
321 * @param integer The page uid, &id=
322 * @param integer The page type, &type=
323 * @param integer sys_language uid, typically &L=
324 * @param string The MP variable (Mount Points), &MP=
325 * @param array Rootline array of only UIDs.
326 * @param array Array of GET variables to register with this indexing
327 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
328 * @return void
329 * @todo Define visibility
330 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = FALSE) {
	// Fills $this->conf for indexing from the backend and then runs init().
	// The cHash is only calculated on request, otherwise it stays empty:
	$cHash = '';
	if ($createCHash) {
		/* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
		$cacheHash = GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\CacheHashCalculator');
		$cHash = $cacheHash->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
	}
	$this->conf = array(
		// Information about the page for which the indexing takes place:
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		// Group list (hardcoded for now...):
		'gr_list' => '0,-1',
		'cHash' => $cHash,
		'cHash_array' => $cHash_array,
		// Defaults:
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',
		'rootline_uids' => $uidRL,
		// Configuration of behavior: index external documents, description length
		// of 200 characters, index keywords and description meta tags:
		'index_externals' => 1,
		'index_descrLgd' => 200,
		'index_metatags' => TRUE
	);
	$this->init();
}
372
373 /**
374 * Sets the free-index uid. Can be called right after backend_initIndexer()
375 *
376 * @param integer Free index UID
377 * @param integer Set id - an integer identifying the "set" of indexing operations.
378 * @return void
379 * @todo Define visibility
380 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0) {
	// Stores both free-index values in the current configuration.
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId
	);
	foreach ($freeIndexSettings as $key => $value) {
		$this->conf[$key] = $value;
	}
}
385
386 /**
387 * Indexing records as the content of a TYPO3 page.
388 *
389 * @param string Title equivalent
390 * @param string Keywords equivalent
391 * @param string Description equivalent
392 * @param string The main content to index
393 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
394 * @param integer Last modification time, in seconds
395 * @param integer The creation date of the content, in seconds
396 * @param integer The record UID that the content comes from (for registration with the indexed rows)
397 * @return void
398 * @todo Define visibility
399 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0) {
	// Wraps the given record data in a minimal HTML document and indexes it like a TYPO3 page.
	// Every part is escaped so that it ends up as plain text inside the fake markup:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);
	// Modification time, creation date and uid of the record the content comes from:
	$this->conf['mtime'] = $mtime;
	$this->conf['crdate'] = $crdate;
	$this->conf['recordUid'] = $recordUid;
	// Fake HTML for parsing:
	$this->conf['content'] = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedBody . '
</body>
</html>';
	// Charset of the content (converted to utf-8 during indexing); no alternative title:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';
	$this->indexTypo3PageContent();
}
429
430 /********************************
431 *
432 * Initialization
433 *
434 *******************************/
435 /**
436 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
437 *
438 * @return void
439 * @todo Define visibility
440 */
public function init() {
	// Prepares the indexer for a run: hashes, extension settings, external parsers,
	// lexer, metaphone hook and charset converter. $this->conf must already be filled.
	$cHashParams = $this->conf['cHash_array'];
	$this->cHashParams = $cHashParams;
	if (is_array($cHashParams) && count($cHashParams)) {
		// The cHash is added so that URLs come out right:
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}
	// phash / phash_grouping identify the indexed page:
	$this->setT3Hashes();
	// Settings from the Extension Manager interface:
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$settings = $this->indexerConfig;
	$this->tstamp_minAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['minAge'] * 3600, 0);
	$this->tstamp_maxAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['maxAge'] * 3600, 0);
	$this->maxExternalFiles = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['maxExternalFiles'], 0, 1000, 5);
	$this->flagBitMask = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['flagBitMask'], 0, 255);
	// Workaround: the setting does not exist while the extension configuration has not
	// been updated yet; in that case metaphone search counts as enabled.
	$this->enableMetaphoneSearch = !isset($settings['enableMetaphoneSearch']) || (bool)$settings['enableMetaphoneSearch'];
	$this->storeMetaphoneInfoAsWords = !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
	// External document parsers (for an example configuration see ext_localconf.php):
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}
	// Lexer, the class that deconstructs the text into words:
	$lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
	$this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];
	// Metaphone hook. It has to be registered _after_ indexed_search, as indexed_search
	// may overwrite the hook depending on the configuration.
	if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}
	// Charset converter:
	$this->csObj = GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
}
481
482 /**
483 * Initialize external parsers
484 *
485 * @return void
486 * @access private
487 * @see init()
488 * @todo Define visibility
489 */
public function initializeExternalParsers() {
	// Creates one parser object per registered file extension and keeps only those
	// whose initParser() call succeeds.
	$registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
	if (!is_array($registeredParsers)) {
		return;
	}
	foreach ($registeredParsers as $extension => $parserReference) {
		$parser = GeneralUtility::getUserObj($parserReference);
		// The parser is registered before it is initialized and removed again on failure:
		$this->external_parsers[$extension] = $parser;
		$parser->pObj = $this;
		$isUsable = $parser->initParser($extension);
		if (!$isUsable) {
			unset($this->external_parsers[$extension]);
		}
	}
}
503
504 /********************************
505 *
506 * Indexing; TYPO3 pages (HTML content)
507 *
508 *******************************/
509 /**
510 * Start indexing of the TYPO3 page
511 *
512 * @return void
513 * @todo Define visibility
514 */
public function indexTypo3PageContent() {
	// Indexes the page described by $this->conf unless timestamps, group list and
	// content hash show that the existing index entry is still valid.
	$phash = $this->hash['phash'];
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $phash);
	$is_grlist = $this->is_grlist_set($phash);
	// Up to date, group list known and nothing forced: only log the reason and stop.
	if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		return;
	}
	if ($this->forceIndexing) {
		$message = 'Indexing needed, reason: Forced';
	} elseif ($check > 0) {
		$message = 'Indexing needed, reason: ' . $this->reasons[$check];
	} else {
		$message = 'Indexing needed, reason: Updates gr_list!';
	}
	$this->log_setTSlogMessage($message, 1);
	// Divide into title, keywords, description and body:
	$this->log_push('Split content', '');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();
	// The hash covers title, description and keywords as well, so that e.g. a changed
	// page title is detected too.
	$this->content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
	// checkContentHash() tells whether a page (with gr_list = 0,-1) is already indexed with the
	// very same content hash. In that case nothing has to be indexed, regardless of mtime.
	// This also keeps logged-in frontend users from triggering re-indexing of unchanged
	// pages, provided the page was indexed without a login before.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check !== 1) {
		// Same content: only refresh timestamp, set id, group list and rootline.
		$this->updateTstamp($phash, $this->conf['mtime']);
		$this->updateSetId($phash);
		// $checkCHash['phash'] is the phash of the row having the same content hash:
		$this->update_grlist($checkCHash['phash'], $phash);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}
	$Pstart = GeneralUtility::milliseconds();
	$this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
	$this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
	$this->log_pull();
	// Splitting words
	$this->log_push('Extract words from content', '');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();
	// Analyse the indexed words
	$this->log_push('Analyse the extracted words', '');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();
	// Submitting page (phash) record
	$this->log_push('Submitting page', '');
	$this->submitPage();
	$this->log_pull();
	// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words', '');
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $this->hash['phash']);
	}
	$this->log_pull();
	// Set parsetime
	$this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);
	// Checking external files if configured for
	$this->log_push('Checking external files', '');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
585
586 /**
587 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
588 *
589 * @param string HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
590 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
591 * @see splitRegularContent()
592 * @todo Define visibility
593 */
public function splitHTMLContent($content) {
	// Splits an HTML document into the parts "title", "keywords", "description" and "body".
	// The head part is everything in front of the first "<body" occurrence.
	$contentArr = $this->defaultContentArray;
	$bodyPart = stristr($content, '<body');
	if ($bodyPart === FALSE) {
		// No body tag at all: the whole document is treated as head so that title and
		// meta tags are still found. (substr($content, 0, -0) would return an empty head.)
		$headPart = (string)$content;
		$contentArr['body'] = '';
	} else {
		$headPart = (string)substr($content, 0, strlen($content) - strlen($bodyPart));
		$contentArr['body'] = $bodyPart;
	}
	// Title: if the title contains a colon, only the part after the first colon is used
	$this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
	$titleParts = explode(':', $contentArr['title'], 2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
	// Keywords and description meta tags
	if ($this->conf['index_metatags']) {
		$meta = array();
		$i = 0;
		// Each successful call stores the attribute string of one meta tag and shortens
		// $headPart; the final, failing call leaves a NULL entry behind.
		while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
			$i++;
		}
		foreach ($meta as $metaAttributeString) {
			if ($metaAttributeString === NULL) {
				continue;
			}
			$attributes = GeneralUtility::get_tag_attributes($metaAttributeString);
			// Meta tags without "name" or "content" (e.g. http-equiv ones) must not raise notices
			$metaName = isset($attributes['name']) ? $attributes['name'] : '';
			$metaContent = isset($attributes['content']) ? $attributes['content'] : '';
			if (stristr($metaName, 'keywords')) {
				$contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
			}
			if (stristr($metaName, 'description')) {
				$contentArr['description'] .= ',' . $metaContent;
			}
		}
	}
	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);
	// Get rid of unwanted sections (ie. scripting and style stuff) in body; every
	// successful call removes one occurrence of the tag.
	foreach (explode(',', $this->excludeSections) as $tag) {
		while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {

		}
	}
	// Remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));
	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);
	return $contentArr;
}
638
639 /**
640 * Extract the charset value from HTML meta tag.
641 *
642 * @param string HTML content
643 * @return string The charset value if found.
644 * @todo Define visibility
645 */
public function getHTMLcharset($content) {
	// Looks for a <meta http-equiv="Content-Type"> tag and returns the charset named
	// in it; NULL is returned if there is no such tag or it names no charset.
	$contentTypeTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
	$charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
	if (!preg_match($contentTypeTagPattern, $content, $tagMatch)) {
		return NULL;
	}
	if (!preg_match($charsetPattern, $tagMatch[0], $charsetMatch)) {
		return NULL;
	}
	return $charsetMatch[1];
}
653
654 /**
655 * Converts a HTML document to utf-8
656 *
657 * @param string HTML content, any charset
658 * @param string Optional charset (otherwise extracted from HTML)
659 * @return string Converted HTML
660 * @todo Define visibility
661 */
public function convertHTMLToUtf8($content, $charset = '') {
	// Converts an HTML document to utf-8, including its entities.
	// Without an explicit charset the one from the HTML meta tag is used:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);
	// Only content that is known to be in another charset needs converting:
	$needsConversion = $charset && $charset !== 'utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}
	// The document is assumed to be utf-8 from here on:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
674
675 /**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
677 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
678 * <title> of document or removing <script>-sections
679 *
680 * @param string String to search in
 * @param string Tag name, eg. "script"
682 * @param string Passed by reference: Content inside found tag
683 * @param string Passed by reference: Content after found tag
684 * @param string Passed by reference: Attributes of the found tag.
685 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
686 * @todo Define visibility
687 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
	// Finds the first "<tagName" (case-insensitive) in $string. $paramList receives everything
	// between the tag name and the next ">". If a matching end tag follows, $tagContent
	// receives the embraced content and $stringAfter the string with the whole element cut
	// out. Otherwise $tagContent is empty and $stringAfter is what follows the start tag.
	$endTag = '</' . $tagName . '>';
	$startTag = '<' . $tagName;
	// stristr used because we want a case-insensitive search for the tag.
	$fromStartTag = stristr($string, $startTag);
	if ($fromStartTag === FALSE) {
		return FALSE;
	}
	// A start tag that is never closed with ">" yields only one part; the remainder
	// is then an empty string instead of an undefined offset.
	$tagParts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
	$paramList = $tagParts[0];
	$afterStartTag = isset($tagParts[1]) ? $tagParts[1] : '';
	$fromEndTag = stristr($afterStartTag, $endTag);
	if ($fromEndTag !== FALSE) {
		// stristr() returned the tail of $string, so the length difference is the tag offset
		$stringBefore = (string)substr($string, 0, strlen($string) - strlen($fromStartTag));
		$tagContent = (string)substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
		$stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
	} else {
		$tagContent = '';
		$stringAfter = $afterStartTag;
	}
	return TRUE;
}
709
710 /**
711 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
712 *
713 * @param string HTML Content, passed by reference
 * @return boolean Returns TRUE if a TYPO3SEARCH_ tag was found, otherwise FALSE.
715 * @todo Define visibility
716 */
public function typoSearchTags(&$body) {
	// Reduces $body to the content enclosed by <!--TYPO3SEARCH_begin--> and
	// <!--TYPO3SEARCH_end--> markers. Content in front of an "end" marker that has no
	// preceding "begin" marker is kept as well. $body is left untouched without markers.
	$expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
	if (count($expBody) <= 1) {
		return FALSE;
	}
	$body = '';
	// Content seen since the last marker that is only included if an "end" marker follows
	$prev = '';
	foreach ($expBody as $val) {
		$part = explode('-->', $val, 2);
		$marker = trim($part[0]);
		if ($marker === 'begin') {
			// A marker without closing "-->" has no content part
			$body .= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker === 'end') {
			$body .= $prev;
			// Consumed; a second "end" marker in a row must not append it again
			$prev = '';
		} else {
			$prev = $val;
		}
	}
	return TRUE;
}
737
738 /**
739 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
740 *
741 * @param string HTML content
742 * @return void
743 * @todo Define visibility
744 */
public function extractLinks($content) {
	// Walks through all hyperlinks of the HTML content. Links with a URL scheme are handed to
	// indexExternalUrl() (if enabled), links to existing local files without query string
	// are indexed directly or, if the crawler is used, added to the crawler queue.
	// Stays NULL unless the crawler is configured and loaded:
	$crawler = NULL;
	// Get links:
	$list = $this->extractHyperLinks($content);
	if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = GeneralUtility::makeInstance('tx_crawler_lib');
	}
	// Traverse links:
	foreach ($list as $linkInfo) {
		// localPath means: This file is sent by a download script. While the indexed URL has to
		// point to $linkInfo['href'], the absolute path to the file is specified here!
		$hasLocalPath = !empty($linkInfo['localPath']);
		// Decode entities:
		$linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
		// parse_url() returns FALSE for seriously malformed URLs; treated as "no components"
		$qParts = parse_url($linkSource) ?: array();
		// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
			parse_str($qParts['query'], $getP);
			if (!isset($getP['jumpurl']) || !is_string($getP['jumpurl'])) {
				// No usable jump target (e.g. "xjumpurl=" or "jumpurl[]="), nothing to index
				continue;
			}
			$linkSource = $getP['jumpurl'];
			$qParts = parse_url($linkSource) ?: array();
		}
		if (!$hasLocalPath && !empty($qParts['scheme'])) {
			if (!empty($this->indexerConfig['indexExternalURLs'])) {
				// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}
		if (!empty($qParts['query'])) {
			// Local links with a query string are not files
			continue;
		}
		$linkSource = urldecode($linkSource);
		if (GeneralUtility::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}
		if (is_object($crawler)) {
			// Queue the file; "alturl" is only set for files behind a download script
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);
			$crawler->addQueueEntry_callBack(0, $params, '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerFilesHook', $this->conf['id']);
			$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
		} elseif ($hasLocalPath) {
			// Index the file under the URL of the download script
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
815
/**
 * Collects the hyperlinks (<a> tags carrying a usable href) found in an HTML string.
 *
 * Links without an href, or whose href starts with "#", are left out.
 *
 * @param string $html HTML content to scan
 * @return array List of arrays with the keys "tag" (the tag chunk delivered by the HTML parser), "href" and "localPath" (result of createLocalPath(), empty if not local)
 * @see extractLinks()
 * @todo Define visibility
 */
public function extractHyperLinks($html) {
    $parser = GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Html\\HtmlParser');
    $links = array();
    foreach ($parser->splitTags('a', $html) as $position => $chunk) {
        // Only the odd-numbered parts of the split result are inspected
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $parser->get_tag_attributes($chunk, TRUE);
        $tagName = $parser->getFirstTagName($chunk);
        if (strtolower($tagName) != 'a') {
            continue;
        }
        $href = $attributes[0]['href'];
        // Skip missing targets and pure in-page anchors
        if (!$href || $href[0] == '#') {
            continue;
        }
        $links[] = array(
            'tag' => $chunk,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        );
    }
    return $links;
}
845
/**
 * Extracts the "base href" from content string.
 *
 * The <base> tags are examined in document order and the search stops at the
 * first one with a non-empty href. A <base> tag without an href attribute
 * yields an empty string (never NULL).
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html) {
    $href = '';
    $htmlParser = GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Html\\HtmlParser');
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd-numbered parts of the split result are inspected
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) == 'base') {
            // A <base> tag may legitimately lack href (e.g. only "target" set);
            // keep the documented string return type in that case
            $href = isset($tagAttributes[0]['href']) ? (string)$tagAttributes[0]['href'] : '';
            if ($href) {
                break;
            }
        }
    }
    return $href;
}
870
871 /******************************************
872 *
873 * Indexing; external URL
874 *
875 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The headers of the URL are requested first. Only if the Content-Type header
 * contains "text/html" is the document itself fetched, stored in a temporary
 * file and handed to indexRegularDocument() with forced indexing. The fetched
 * content is also kept in $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 * @todo Define visibility
 */
public function indexExternalUrl($externalUrl) {
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    // No headers at all, or no Content-Type header: nothing to index
    if (!is_array($urlHeaders) || !isset($urlHeaders['Content-Type'])) {
        return;
    }
    if (!stristr($urlHeaders['Content-Type'], 'text/html')) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if (!strlen($content)) {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    GeneralUtility::writeFile($tmpFile, $content);
    try {
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
    } catch (\Exception $exception) {
        // Do not leave the temporary file behind when indexing fails
        unlink($tmpFile);
        throw $exception;
    }
    unlink($tmpFile);
}
906
/**
 * Getting HTTP request headers of URL
 *
 * Each response line is split at the first colon; the part before it becomes
 * the array key, the remainder (not trimmed) the value. Lines without a colon,
 * such as a status line, are stored with a NULL value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 * @todo Define visibility
 */
public function getUrlHeaders($url) {
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if (!strlen($content)) {
        // Make the documented "no answer" result explicit
        return FALSE;
    }
    // Compile headers:
    $headers = GeneralUtility::trimExplode(LF, $content, TRUE);
    $retVal = array();
    foreach ($headers as $line) {
        if (!strlen(trim($line))) {
            break;
        }
        $parts = explode(':', $line, 2);
        // Guard against lines without a colon instead of relying on list()
        // with a missing second element
        $retVal[$parts[0]] = isset($parts[1]) ? $parts[1] : NULL;
    }
    return $retVal;
}
932
/**
 * Tries to map a link target to a local file.
 *
 * The resolver methods are asked one after another in a fixed order; the
 * first non-empty answer is used.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
    $resolverMethods = array(
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    );
    $resolvedPath = '';
    foreach ($resolverMethods as $resolverMethod) {
        $resolvedPath = $this->{$resolverMethod}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
956
/**
 * Looks the link target up in the file map registered in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']. This is
 * useful for download extensions that hide the actual file name but still
 * want the file to be indexed.
 *
 * The map is keyed by GeneralUtility::shortMD5() of the link target.
 *
 * @param string $sourcePath
 * @return string Registered file path if it points to an existing file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: self::isAllowedLocalFile is deliberately not used here because
    // this method is allowed to index files outside of the web site (for
    // example, protected downloads)
    if (isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey])) {
        return $registeredFiles[$lookupKey];
    }
    return '';
}
979
/**
 * Resolves link targets that start with the site URL of the current request
 * (TYPO3_SITE_URL) to a path below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    // Strip the site URL and continue with the remainder below the web root
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
999
/**
 * Resolves link targets that start with the configured absRefPrefix to a
 * path below PATH_site. Requires TSFE; without it nothing is resolved.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
    if (!$GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    // An empty prefix would match everything, so it is ignored
    if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1022
/**
 * Resolves link targets that begin with a slash (absolute URL without
 * schema) to a path below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
    if ($sourcePath[0] != '/') {
        return '';
    }
    // Drop the leading slash before appending to the web root
    $candidate = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1041
/**
 * Resolves relative link targets (see isRelativeURL()) to a path below
 * PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), else empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1058
/**
 * Tells whether a URL is relative, i.e. it has no scheme and its path does
 * not begin with a slash.
 *
 * @param string $url
 * @return boolean
 */
static protected function isRelativeURL($url) {
    $components = @parse_url($url);
    // Anything carrying a scheme is not relative
    if ($components['scheme'] != '') {
        return FALSE;
    }
    return $components['path'][0] != '/';
}
1069
/**
 * Tells whether the path, after resolving back paths, lies below PATH_site
 * and refers to an existing file.
 *
 * @param string $filePath
 * @return boolean
 */
static protected function isAllowedLocalFile($filePath) {
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    if (substr($resolvedPath, 0, strlen(PATH_site)) != PATH_site) {
        return FALSE;
    }
    return is_file($resolvedPath);
}
1082
1083 /******************************************
1084 *
1085 * Indexing; external files (PDF, DOC, etc)
1086 *
1087 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The file is indexed once per content part reported by fileContentParts().
 * For each part the mtime check decides whether indexing is needed; content
 * is only re-submitted if the content hash check passes or $force is set.
 * A section record is ensured for every part, indexed or not.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param boolean $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 * @todo Define visibility
 */
public function indexRegularDocument($file, $force = FALSE, $contentTmpFile = '', $altExtension = '') {
    // Init; files without an extension resolve to an empty extension string
    $fI = pathinfo($file);
    $ext = $altExtension ?: (isset($fI['extension']) ? strtolower($fI['extension']) : '');
    // Create abs-path:
    if ($contentTmpFile) {
        $absFile = $contentTmpFile;
    } else {
        if (GeneralUtility::isAbsPath($file)) {
            // Absolute, pass-through:
            $absFile = $file;
        } else {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = array();
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = array('key' => $cPKey);
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the reversed implode() argument order is no longer accepted by PHP 8
                    $content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1194
/**
 * Reads the content of an external file being indexed by delegating to the
 * external parser registered for the file extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array Standard content array (title, description, keywords, body keys); NULL if no parser object is registered for the extension
 * @todo Define visibility
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
    // Consult relevant external document parser:
    if (!is_object($this->external_parsers[$fileExtension])) {
        return NULL;
    }
    return $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1213
/**
 * Asks the external parser registered for the extension into which sections
 * the document should be divided.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; array(0) if no parser object is registered for the extension
 * @todo Define visibility
 */
public function fileContentParts($ext, $absFile) {
    // Consult relevant external document parser:
    if (!is_object($this->external_parsers[$ext])) {
        return array(0);
    }
    return $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
}
1230
/**
 * Wraps non-HTML content (from external files for instance) in a content
 * array: a copy of $this->defaultContentArray with the "body" key set to
 * the given content.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Content array with the key "body" set; the remaining keys come from the default content array
 * @see splitHTMLContent()
 * @todo Define visibility
 */
public function splitRegularContent($content) {
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;
    return $parts;
}
1244
1245 /**********************************
1246 *
1247 * Analysing content, Extracting words
1248 *
1249 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * charset already is "utf-8") and decodes entities into real characters.
 * The array is modified in place.
 *
 * @param array $contentArr Standard content array, passed by reference
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 * @todo Define visibility
 */
public function charsetEntity2utf8(&$contentArr, $charset) {
    $mustConvert = $charset !== 'utf-8';
    foreach (array_keys($contentArr) as $field) {
        // Empty values need no treatment
        if (!strlen($contentArr[$field])) {
            continue;
        }
        if ($mustConvert) {
            $contentArr[$field] = $this->csObj->utf8_encode($contentArr[$field], $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$field] = $this->csObj->entities_to_utf8($contentArr[$field], TRUE);
    }
}
1270
/**
 * Splits every part of the content array into words using the lexer and
 * removes duplicate words from the title, keywords and description parts.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words (unique for title, keywords and description)
 * @todo Define visibility
 */
public function processWordsInArrays($contentArr) {
    // split all parts to words
    foreach (array_keys($contentArr) as $part) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (array('title', 'keywords', 'description') as $part) {
        $contentArr[$part] = array_unique($contentArr[$part]);
    }
    return $contentArr;
}
1290
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length is taken from $this->conf['index_descrLgd'], forced
 * into the range 0-255 with 200 as default. If the resulting length is 0,
 * an empty string is returned.
 *
 * @param array $contentArr Content array
 * @return string Description string
 * @todo Define visibility
 */
public function bodyDescription($contentArr) {
    // Always return a string, also when no description is to be generated
    $bodyDescription = '';
    // Setting description
    $maxL = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }
    return $bodyDescription;
}
1308
/**
 * Analyzes content to use for indexing.
 *
 * Title, keywords and description words are registered with the priority
 * offsets 7, 6 and 5 respectively; afterwards the body words are added.
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array keyed by word, as filled by analyzeHeaderinfo() and analyzeBody()
 * @todo Define visibility
 */
public function indexAnalyze($content) {
    $indexArr = array();
    $this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
    $this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
    $this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1325
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the entry in the index array gets its
 * hash and metaphone value (on first occurrence), the priority bit
 * 2^$offset OR-ed into "cmp" and its "count" increased by one.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param integer $offset Bit-wise priority to type
 * @return void
 * @todo Define visibility
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
    // Priority bit used for flagBitMask feature (see extension configuration)
    $priorityBit = pow(2, $offset);
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // "cmp" and "count" do not exist yet on the first occurrence of a word
        // (or when the word was registered by analyzeBody()), so start from 0
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | $priorityBit;
        // Increase number of occurences
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1358
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the entry in the index array gets the
 * position of its first occurrence, its hash and metaphone value (only if
 * the word is not registered yet) and its "count" increased by one.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 * @todo Define visibility
 */
public function analyzeBody(&$retArr, $content) {
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurences; "count" does not exist yet on the
        // first occurrence of a word, so start from 0
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1389
/**
 * Creating metaphone based hash from input word
 *
 * Uses $this->metaphoneObj if it is an object, otherwise PHP's native
 * metaphone() function.
 *
 * @param string $word Word to convert
 * @param boolean $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 * @todo Define visibility
 */
public function metaphone($word, $returnRawMetaphoneValue = FALSE) {
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        // Use native PHP function instead of advanced doubleMetaphone class
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if (!strlen($rawValue)) {
        return 0;
    }
    // Create hash and return integer
    return \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($rawValue);
}
1415
1416 /********************************
1417 *
1418 * SQL; TYPO3 Pages
1419 *
1420 *******************************/
/**
 * Writes the index records of the current TYPO3 page (not external media)
 * to the database: old records for the phash are removed, then the
 * index_phash, index_section, index_grlist and index_fulltext records are
 * inserted, plus an index_debug record if debug mode is enabled. Tables
 * that are not in use are skipped.
 *
 * @return void
 * @todo Define visibility
 */
public function submitPage() {
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // The new index_phash row
    $phashRecord = array(
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // Item type 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int) $this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRecord);
    }
    // index_section and index_grlist
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // index_fulltext, optionally cut to the configured length
    $fulltext = implode(' ', $this->contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltext = substr($fulltext, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRecord = array(
        'phash' => $this->hash['phash'],
        'fulltextdata' => $fulltext,
        'metaphonedata' => $this->metaphoneContent
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRecord);
    }
    // index_debug is only written in debug mode
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // Content and body are shortened to 1000 bytes for the debug record
    $debugInfo = array(
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
        'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRecord = array(
        'phash' => $this->hash['phash'],
        'debuginfo' => serialize($debugInfo)
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRecord);
    }
}
1495
/**
 * Inserts an index_grlist record for the current gr_list
 * ($this->conf['gr_list']), unless the table is not in use.
 *
 * @param integer $hash Search result record phash
 * @param integer $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 * @todo Define visibility
 */
public function submit_grlist($hash, $phash_x) {
    $groupList = $this->conf['gr_list'];
    $grlistRecord = array(
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    );
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grlistRecord);
}
1517
/**
 * Inserts an index_section record, unless the table is not in use.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 * The record is completed by getRootLineFields() before it is stored.
 *
 * @param integer $hash phash of TYPO3 parent search result record
 * @param integer $hash_t3 phash of the file indexation search record
 * @return void
 * @todo Define visibility
 */
public function submit_section($hash, $hash_t3) {
    $sectionRecord = array(
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    );
    $this->getRootLineFields($sectionRecord);
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRecord);
}
1538
/**
 * Deletes all index records stored for the given page phash from the
 * tables that are in use.
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedPages($phash) {
    $phashValue = (int)$phash;
    // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
    $tables = explode(',', 'index_phash,index_section,index_grlist,index_fulltext,index_debug');
    foreach ($tables as $tableName) {
        if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . $phashValue);
    }
    // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $phashValue);
    }
}
1559
1560 /********************************
1561 *
1562 * SQL; External media
1563 *
1564 *******************************/
/**
 * Writes the index records of an external file to the database: old
 * records for the phash are removed, then the index_phash and
 * index_fulltext records are inserted, plus an index_debug record if debug
 * mode is enabled. Tables that are not in use are skipped.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param integer $mtime Modification time of file.
 * @param integer $ctime Creation time of file.
 * @param integer $size Size of file in bytes
 * @param integer $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 * @todo Define visibility
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {
    // Find item type: the parser's mapping for the extension, or the extension itself
    $mappedItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    if ($mappedItemType) {
        $itemType = $mappedItemType;
    } else {
        $itemType = $ext;
    }
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // A file name carrying a scheme is stored as external URL
    $fileNameParts = parse_url($file);
    // Fall back to the base name if the document has no title
    $itemTitle = trim($contentParts['title']);
    if (!$itemTitle) {
        $itemTitle = basename($file);
    }
    $phashRecord = array(
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $itemType,
        'item_title' => $itemTitle,
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $fileNameParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRecord);
    }
    // index_fulltext, optionally cut to the configured length
    $fulltext = implode(' ', $contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltext = substr($fulltext, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRecord = array(
        'phash' => $hash['phash'],
        'fulltextdata' => $fulltext,
        'metaphonedata' => $this->metaphoneContent
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRecord);
    }
    // index_debug is only written in debug mode
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // The body is shortened to 1000 bytes for the debug record
    $debugInfo = array(
        'cHashParams' => $subinfo,
        'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    );
    $debugRecord = array(
        'phash' => $hash['phash'],
        'debuginfo' => serialize($debugInfo)
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRecord);
    }
}
1641
/**
 * Registers a gr_list entry for an indexed file unless a matching one already exists.
 *
 * An index_grlist row for the phash counts as "already there" when its hash_gr_list
 * equals the integer hash of either $this->defaultGrList or $this->conf['gr_list'].
 * Nothing happens when the index_grlist table is not in use.
 *
 * @param integer $hash phash value of the file
 * @return void
 * @todo Define visibility
 */
public function submitFile_grlist($hash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$phashCondition = 'phash=' . (int)$hash;
	$defaultGroupHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList);
	$currentGroupHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
	// A record for the default group list or for the current one makes another insert unnecessary.
	$where = $phashCondition . ' AND (hash_gr_list=' . $defaultGroupHash . ' OR hash_gr_list=' . $currentGroupHash . ')';
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existing == 0) {
		// For files the search-result phash and the "real" phash are the same value.
		$this->submit_grlist($hash, $hash);
	}
}
1658
/**
 * Registers a section entry for an indexed file unless one already exists
 * for the combination of the file's phash and the current page id ($this->conf['id']).
 *
 * Nothing happens when the index_section table is not in use.
 *
 * @param integer $hash phash value of the file
 * @return void
 * @todo Define visibility
 */
public function submitFile_section($hash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	$where = 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id'];
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
	if ($existing == 0) {
		// Second argument is the phash of the page currently being indexed.
		$this->submit_section($hash, $this->hash['phash']);
	}
}
1675
/**
 * Deletes the rows registered for a file's phash from the tables
 * index_phash, index_grlist, index_fulltext and index_debug.
 *
 * Tables that are not in use are skipped.
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedFiles($phash) {
	$where = 'phash=' . (int)$phash;
	$tables = explode(',', 'index_phash,index_grlist,index_fulltext,index_debug');
	foreach ($tables as $tableName) {
		if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
			continue;
		}
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
	}
}
1692
1693 /********************************
1694 *
1695 * SQL Helper functions
1696 *
1697 *******************************/
/**
 * Compares the mtime / tstamp stored in index_phash for a phash with the given mtime
 * and the configured age limits, and tells whether the page/file has to be indexed.
 *
 * Result codes (see $this->reasons); negative = no indexing, positive = do indexing:
 *  -2: The minimum age was not exceeded.
 *  -1: mtime matched the indexed row, nothing changed.
 *   1: The configured max age was exceeded.
 *   2: Minimum age exceeded (or not set) and mtime differs from the indexed one.
 *   3: Minimum age exceeded (or not set), but no mtime was given.
 *   4: No index_phash row found, or the index_phash table is not in use.
 *
 * Side effect: for result -1 the row's tstamp is refreshed through updateTstamp(),
 * but only when no max age is configured. A log message is written in both cases.
 *
 * @param integer $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param integer $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return integer One of the result codes listed above
 * @todo Define visibility
 */
public function checkMtimeTstamp($mtime, $phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		// Not indexed (not in index_phash)
		return 4;
	}
	$indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
	if (!$indexedRow) {
		// Page has never been indexed (is not represented in the index_phash table).
		return 4;
	}
	if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
		// The configured max-age was exceeded for the document and thus it's indexed.
		return 1;
	}
	// A missing minAge counts as "exceeded", so that mtime gets considered.
	$minAgeExceeded = !$this->tstamp_minAge || $indexedRow['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'];
	if (!$minAgeExceeded) {
		// The minimum age was not exceeded
		return -2;
	}
	if (!$mtime) {
		// The minimum age was exceeded, but mtime was not set, so the page is indexed.
		return 3;
	}
	if ($indexedRow['item_mtime'] != $mtime) {
		// mtime differs from the index_phash mtime, so it is time to re-index.
		return 2;
	}
	// mtime matched the document, so no changes detected and no content updated
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
	}
	return -1;
}
1754
/**
 * Looks for an index_phash row that has the current phash_grouping
 * ($this->hash['phash_grouping']) and the current content hash ($this->content_md5h).
 *
 * With this lookup a page is only indexed if its content differs from
 * an already indexed page of the same "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no row found, or index_phash is not in use), otherwise the found row (array with key "phash") to which the grlist record should be related
 * @todo Define visibility
 */
public function checkContentHash() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return TRUE;
	}
	$where = 'phash_grouping=' . (int)$this->hash['phash_grouping'] . ' AND contentHash=' . (int)$this->content_md5h;
	$matchingRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
	return $matchingRow ?: TRUE;
}
1772
/**
 * Content hash check for external documents: tells whether index_phash
 * lacks a row with the given phash_grouping and content hash.
 *
 * @param integer $hashGr phash value to check (phash_grouping)
 * @param integer $content_md5h Content hash to check
 * @return boolean TRUE if the document needs to be indexed (no matching row, or index_phash is not in use)
 * @todo Define visibility
 */
public function checkExternalDocContentHash($hashGr, $content_md5h) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return TRUE;
	}
	$where = 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h;
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
	return $matches == 0;
}
1790
/**
 * Tells whether index_grlist contains at least one row whose phash_x equals the given value
 * (the "real" phash of the current content, not the linked-to phash of the common search result page).
 *
 * @param integer $phash_x Phash integer to test.
 * @return boolean TRUE if a row exists; FALSE otherwise or when index_grlist is not in use
 * @todo Define visibility
 */
public function is_grlist_set($phash_x) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return FALSE;
	}
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
	return $matches > 0;
}
1806
/**
 * Makes sure a grlist entry exists for the phash and the current gr_list
 * ($this->conf['gr_list']); writes one through submit_grlist() and logs it if it is missing.
 *
 * @param integer $phash phash of the search result that should be found
 * @param integer $phash_x The real phash of the current content. The two values differ when a page with user login turns out to contain the exact same content as another already indexed version of the page; that is the reason the grlist table exists.
 * @return void
 * @see submit_grlist()
 * @todo Define visibility
 */
public function update_grlist($phash, $phash_x) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$groupListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
	$where = 'phash=' . (int)$phash . ' AND hash_gr_list=' . $groupListHash;
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existing != 0) {
		return;
	}
	$this->submit_grlist($phash, $phash_x);
	$this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1825
/**
 * Sets the tstamp of an index_phash row to the current execution time
 * ($GLOBALS['EXEC_TIME']) and optionally stores a new item_mtime as well.
 *
 * @param integer $phash phash value
 * @param integer $mtime If set (non-empty), the item_mtime field is updated to this value.
 * @return void
 * @todo Define visibility
 */
public function updateTstamp($phash, $mtime = 0) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$changes = array('tstamp' => $GLOBALS['EXEC_TIME']);
	if ($mtime) {
		$changes['item_mtime'] = (int)$mtime;
	}
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1845
/**
 * Writes the current freeIndexSetId ($this->conf['freeIndexSetId'], cast to integer)
 * into the index_phash row of the given phash.
 *
 * @param integer $phash phash value
 * @return void
 * @todo Define visibility
 */
public function updateSetId($phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$where = 'phash=' . (int)$phash;
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array(
		'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
	));
}
1861
/**
 * Stores the parse time in the index_phash row of the given phash.
 *
 * @param integer $phash phash value.
 * @param integer $parsetime Parsetime value to set (cast to integer before storing).
 * @return void
 * @todo Define visibility
 */
public function updateParsetime($phash, $parsetime) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$where = 'phash=' . (int)$phash;
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array(
		'parsetime' => (int)$parsetime
	));
}
1878
/**
 * Rewrites the rootline fields (as produced by getRootLineFields()) of all
 * index_section rows that belong to the current page id ($this->conf['id']).
 *
 * @return void
 * @todo Define visibility
 */
public function updateRootline() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	$rootlineFields = array();
	// Filled by reference.
	$this->getRootLineFields($rootlineFields);
	$where = 'page_id=' . (int)$this->conf['id'];
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootlineFields);
}
1892
/**
 * Adds the root-line field values to the given field array.
 *
 * rl0, rl1 and rl2 are always set from levels 0, 1 and 2 of $this->conf['rootline_uids'].
 * Additional fields are taken from the map "field name => rootline level" in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'], if that is an array.
 * All values are cast to integer; existing entries with other keys are left untouched.
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 * @todo Define visibility
 */
public function getRootLineFields(array &$fieldArray) {
	$standardLevels = array('rl0' => 0, 'rl1' => 1, 'rl2' => 2);
	foreach ($standardLevels as $column => $level) {
		$fieldArray[$column] = (int)$this->conf['rootline_uids'][$level];
	}
	$additionalLevels = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($additionalLevels)) {
		return;
	}
	foreach ($additionalLevels as $column => $level) {
		$fieldArray[$column] = (int)$this->conf['rootline_uids'][$level];
	}
}
1911
/**
 * Removes any indexed pages with user logins which have the same contentHash.
 * NOT USED anywhere inside this class!
 *
 * Selects the phash of every index_phash row that shares the current phash_grouping
 * and content hash and is joined to an index_grlist row whose hash_gr_list differs
 * from the hash of $this->defaultGrList. Each hit is logged and passed to
 * removeOldIndexedPages(). Nothing happens unless both tables are in use.
 *
 * @return void
 * @todo Define visibility
 */
public function removeLoginpagesWithContentHash() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash') || !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', '
		A.phash=B.phash
		AND A.phash_grouping=' . (int)$this->hash['phash_grouping'] . '
		AND B.hash_gr_list<>' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList) . '
		AND A.contentHash=' . (int)$this->content_md5h);
	if (!$res) {
		// The query did not yield a result set: there is nothing to fetch and,
		// just as important, nothing that could be handed to sql_free_result().
		return;
	}
	while (FALSE !== ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		$this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
		$this->removeOldIndexedPages($row['phash']);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
}
1933
/**
 * Includes the crawler library file "class.tx_crawler_lib.php"
 * from the extension path of the "crawler" extension.
 *
 * @return void
 * @todo Define visibility
 */
public function includeCrawlerClass() {
	$crawlerExtensionPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler');
	GeneralUtility::requireOnce($crawlerExtensionPath . 'class.tx_crawler_lib.php');
}
1943
1944 /********************************
1945 *
1946 * SQL; Submitting words
1947 *
1948 *******************************/
/**
 * Adds the words of the list that are not yet registered to index_words.
 *
 * The number of rows matching the word ids is compared with the size of the list first;
 * only if they differ are the known basewords fetched, removed from the list
 * and the remaining words inserted.
 *
 * @param array $wordListArray Word List array keyed by baseword (each entry has "hash" and "metaphone" among other information).
 * @return void
 * @todo Define visibility
 */
public function checkWordList($wordListArray) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') || !count($wordListArray)) {
		return;
	}
	$wordIds = array();
	foreach ($wordListArray as $wordInfo) {
		$wordIds[] = (int)$wordInfo['hash'];
	}
	$where = 'wid IN (' . implode(',', $wordIds) . ')';
	$knownCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
	if ($knownCount == count($wordListArray)) {
		// Every word is registered already.
		return;
	}
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
	$this->log_setTSlogMessage('Inserting words: ' . (count($wordListArray) - $knownCount), 1);
	// Drop the words that are in the table already, so only new ones remain.
	while (FALSE != ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		unset($wordListArray[$row['baseword']]);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
	foreach ($wordListArray as $baseword => $wordInfo) {
		// A duplicate-key error will occur here if a word was NOT removed by the unset() above. However, as long as the words in the list are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
			'wid' => $wordInfo['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordInfo['metaphone']
		));
	}
}
1985
/**
 * Submits the RELATIONS between words and a phash to index_rel.
 *
 * All existing index_rel rows of the phash are deleted first, then one row per word is inserted.
 * "freq" is the word's count divided by $this->wordcount, passed through freqMap();
 * "flags" is the word's "cmp" value masked with $this->flagBitMask.
 *
 * @param array $wordList Word list array
 * @param integer $phash phash value
 * @return void
 * @todo Define visibility
 */
public function submitWords($wordList, $phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_rel')) {
		return;
	}
	$pageHash = (int)$phash;
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . $pageHash);
	foreach ($wordList as $wordInfo) {
		$relativeFrequency = $wordInfo['count'] / $this->wordcount;
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', array(
			'phash' => $pageHash,
			'wid' => (int)$wordInfo['hash'],
			'count' => (int)$wordInfo['count'],
			'first' => (int)$wordInfo['first'],
			'freq' => $this->freqMap($relativeFrequency),
			'flags' => $wordInfo['cmp'] & $this->flagBitMask
		));
	}
}
2010
/**
 * Maps a frequency between the real interval [0;1] and the scale [0;$this->freqRange].
 *
 * The scale factor is $this->freqMax * 100 * $this->freqRange.
 * - Input <= 1 (including exactly 1): multiplied by the factor and capped at $this->freqRange.
 * - Input > 1: treated as an already mapped value and divided by the factor ("and back").
 *
 * The result is not cast, so it may be a float.
 *
 * @param double $freq Frequency
 * @return integer|double Frequency in range.
 * @todo Define visibility
 */
public function freqMap($freq) {
	$scale = $this->freqMax * 100 * $this->freqRange;
	if ($freq <= 1) {
		$scaled = $freq * $scale;
		if ($scaled > $this->freqRange) {
			// Cap at the upper end of the range.
			return $this->freqRange;
		}
		return $scaled;
	}
	return $freq / $scale;
}
2029
2030 /********************************
2031 *
2032 * Hashing
2033 *
2034 *******************************/
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" as the combination of id, type, language, mount point and
 * cHash parameters. "phash" additionally includes the gr_list, i.e. it subdivides the page by the
 * login-dependent composition (such pages are expected to be similar regardless of the login).
 *
 * @return void
 * @todo Define visibility
 */
public function setT3Hashes() {
	// The key order matters: the array is serialized before hashing.
	$pageIdentity = array(
		'id' => (int)$this->conf['id'],
		'type' => (int)$this->conf['type'],
		'sys_lang' => (int)$this->conf['sys_language_uid'],
		'MP' => (string) $this->conf['MP'],
		'cHash' => $this->cHashParams
	);
	$this->hash['phash_grouping'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($pageIdentity));
	$loginSpecificIdentity = $pageIdentity;
	$loginSpecificIdentity['gr_list'] = (string) $this->conf['gr_list'];
	$this->hash['phash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($loginSpecificIdentity));
}
2056
/**
 * Calculates the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the file name plus $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 * @todo Define visibility
 */
public function setExtHashes($file, $subinfo = array()) {
	$fileIdentity = array(
		'file' => $file
	);
	$groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($fileIdentity));
	// The sub information distinguishes parts of the same file.
	$fileIdentity['subinfo'] = $subinfo;
	$partHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($fileIdentity));
	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $partHash
	);
}
2078
2079 /*********************************
2080 *
2081 * Internal logging functions
2082 *
2083 *********************************/
/**
 * Forwards a push() call to the TT object in $GLOBALS['TT'].
 * Does nothing when $GLOBALS['TT'] is not an object.
 *
 * @param string $msg Title to set
 * @param string $key Key (?)
 * @return void
 * @todo Define visibility
 */
public function log_push($msg, $key) {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->push($msg, $key);
}
2097
/**
 * Forwards a pull() call to the TT object in $GLOBALS['TT'].
 * Does nothing when $GLOBALS['TT'] is not an object.
 *
 * @return void
 * @todo Define visibility
 */
public function log_pull() {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->pull();
}
2109
/**
 * Passes a log message to the TT object in $GLOBALS['TT'] (if that is an object)
 * and always appends the message to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param integer $errorNum Error number
 * @return void
 * @todo Define visibility
 */
public function log_setTSlogMessage($msg, $errorNum = 0) {
	$hasTimeTracker = is_object($GLOBALS['TT']);
	if ($hasTimeTracker) {
		$GLOBALS['TT']->setTSlogMessage($msg, $errorNum);
	}
	// The internal log only keeps the message text, not the error number.
	$this->internal_log[] = $msg;
}
2124
2125 /**************************
2126 *
2127 * \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController hooks:
2128 *
2129 **************************/
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as a part of the fulltext index.
 *
 * The comma-separated list is split with GeneralUtility::trimExplode(), re-joined
 * with ", " and wrapped in one leading and one trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList) {
	$trimmedKeywords = GeneralUtility::trimExplode(',', $keywordList);
	$spacedList = implode(', ', $trimmedKeywords);
	return ' ' . $spacedList . ' ';
}
2142
2143 }