[BUGFIX] Indexer tries to insert NULL into DB
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2001-2013 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the TYPO3 project. The TYPO3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 * A copy is found in the textfile GPL.txt and important notices to the license
19 * from the author is found in LICENSE.txt distributed with these scripts.
20 *
21 *
22 * This script is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * This copyright notice MUST APPEAR in all copies of the script!
28 ***************************************************************/
29
30 use TYPO3\CMS\Core\Utility\GeneralUtility;
31
32 /**
33 * This class is a search indexer for TYPO3
34 *
35 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
36 */
37 /**
38 * Indexing class for TYPO3 frontend
39 *
40 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
41 */
42 class Indexer {
43
44 // Messages:
45 /**
46 * @todo Define visibility
47 */
48 public $reasons = array(
49 -1 => 'mtime matched the document, so no changes detected and no content updated',
50 -2 => 'The minimum age was not exceeded',
51 1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
52 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
53 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
54 4 => 'Page has never been indexed (is not represented in the index_phash table).'
55 );
56
57 // HTML code blocks to exclude from indexing:
58 /**
59 * @todo Define visibility
60 */
61 public $excludeSections = 'script,style';
62
63 // Supported Extensions for external files:
64 /**
65 * @todo Define visibility
66 */
67 public $external_parsers = array();
68
// $external_parsers (above): External parser objects, keys are file extension names. Values are objects with certain methods.
// $defaultGrList (below): Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
71 /**
72 * @todo Define visibility
73 */
74 public $defaultGrList = '0,-1';
75
76 // Min/Max times:
77 /**
78 * @todo Define visibility
79 */
80 public $tstamp_maxAge = 0;
81
// $tstamp_maxAge (above): If set, this is the maximum age in seconds of an indexed document. Regardless of mtime, the document will be re-indexed if this limit is exceeded.
83 /**
84 * @todo Define visibility
85 */
86 public $tstamp_minAge = 0;
87
// $tstamp_minAge (above): If set, this is the minimum age in seconds before a document can be indexed again. This is regardless of mtime.
89 /**
90 * @todo Define visibility
91 */
92 public $maxExternalFiles = 0;
93
// $maxExternalFiles (above): Max number of external files to index.
95 /**
96 * @todo Define visibility
97 */
98 public $forceIndexing = FALSE;
99
// $forceIndexing (above): If TRUE, indexing is forced regardless of hashes etc.
101 /**
102 * @todo Define visibility
103 */
104 public $crawlerActive = FALSE;
105
// $crawlerActive (above): Set when the crawler is detected (internal)
107 // INTERNALS:
108 /**
109 * @todo Define visibility
110 */
111 public $defaultContentArray = array(
112 'title' => '',
113 'description' => '',
114 'keywords' => '',
115 'body' => ''
116 );
117
118 /**
119 * @todo Define visibility
120 */
121 public $wordcount = 0;
122
123 /**
124 * @todo Define visibility
125 */
126 public $externalFileCounter = 0;
127
128 /**
129 * @todo Define visibility
130 */
131 public $conf = array();
132
133 // Configuration set internally (see init functions for required keys and their meaning)
134 /**
135 * @todo Define visibility
136 */
137 public $indexerConfig = array();
138
139 // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
140 /**
141 * @todo Define visibility
142 */
143 public $hash = array();
144
145 // Hash array, contains phash and phash_grouping
146 /**
147 * @todo Define visibility
148 */
149 public $file_phash_arr = array();
150
151 // Hash array for files
152 /**
153 * @todo Define visibility
154 */
155 public $contentParts = array();
156
157 // Content of TYPO3 page
158 /**
159 * @todo Define visibility
160 */
161 public $content_md5h = '';
162
163 /**
164 * @todo Define visibility
165 */
166 public $internal_log = array();
167
168 // Internal log
169 /**
170 * @todo Define visibility
171 */
172 public $indexExternalUrl_content = '';
173
174 /**
175 * @todo Define visibility
176 */
177 public $cHashParams = array();
178
179 // cHashparams array
180 /**
181 * @todo Define visibility
182 */
183 public $freqRange = 32000;
184
185 /**
186 * @todo Define visibility
187 */
188 public $freqMax = 0.1;
189
190 /**
191 * @todo Define visibility
192 */
193 public $enableMetaphoneSearch = FALSE;
194
195 /**
196 * @todo Define visibility
197 */
198 public $storeMetaphoneInfoAsWords;
199
200 /**
201 * @todo Define visibility
202 */
203 public $metaphoneContent = '';
204
205 // Objects:
206 /**
207 * Charset class object
208 *
209 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
210 * @todo Define visibility
211 */
212 public $csObj;
213
214 /**
215 * Metaphone object, if any
216 *
217 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
218 * @todo Define visibility
219 */
220 public $metaphoneObj;
221
222 /**
223 * Lexer object for word splitting
224 *
225 * @var \TYPO3\CMS\IndexedSearch\Lexer
226 * @todo Define visibility
227 */
228 public $lexerObj;
229
230 /**
231 * @todo Define visibility
232 */
233 public $flagBitMask;
234
/**
 * Frontend hook: decides whether the page that was just rendered should be indexed and,
 * if so, copies the page data into $this->conf and starts the indexing.
 *
 * Indexing is skipped (with a TS log message) when ordinary frontend indexing is disabled and
 * no crawler run forces it, when the page has "no_search" set, when the page is "no_cache",
 * or when sys_language_uid differs from sys_language_content (fall-back content).
 *
 * @param object $pObj Parent Object (frontend TSFE object), passed by reference
 * @return void
 * @todo Define visibility
 */
public function hook_indexContent(&$pObj) {
    // Indexer configuration from Extension Manager interface.
    // unserialize() returns FALSE on broken/missing configuration, so normalise to an array.
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    if (!is_array($indexerConfig)) {
        $indexerConfig = array();
    }
    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing
    // is requested as processing instruction.
    if (\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler') && !empty($pObj->applicationData['tx_crawler']['running'])) {
        $procInstructions = isset($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
            ? $pObj->applicationData['tx_crawler']['parameters']['procInstructions']
            : NULL;
        // Only search a real array, and compare strictly so that e.g. an integer 0 entry
        // cannot loosely match the instruction name.
        if (is_array($procInstructions) && in_array('tx_indexedsearch_reindex', $procInstructions, TRUE)) {
            // Setting simple log message:
            $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
            // Crawler active flag + forced indexing:
            $this->crawlerActive = TRUE;
            $this->forceIndexing = TRUE;
        }
    }
    // Nothing to do (and nothing to log) unless indexing is enabled for this page
    if (!$pObj->config['config']['index_enable']) {
        return;
    }
    $this->log_push('Index page', '');
    if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Setting up internal configuration from config array:
        $this->conf = array();
        // Information about page for which the indexing takes place
        // Page id
        $this->conf['id'] = $pObj->id;
        // Page type
        $this->conf['type'] = $pObj->type;
        // sys_language UID of the language of the indexing.
        $this->conf['sys_language_uid'] = $pObj->sys_language_uid;
        // MP variable, if any (Mount Points)
        $this->conf['MP'] = $pObj->MP;
        // Group list
        $this->conf['gr_list'] = $pObj->gr_list;
        // cHash string for additional parameters
        $this->conf['cHash'] = $pObj->cHash;
        // Array of the additional parameters
        $this->conf['cHash_array'] = $pObj->cHash_array;
        // The creation date of the TYPO3 page
        $this->conf['crdate'] = $pObj->page['crdate'];
        // reg1 of the caching table. Not known what practical use this has.
        $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;
        // Root line uids
        $this->conf['rootline_uids'] = array();
        foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
            $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
        }
        // Content string (HTML of TYPO3 page)
        $this->conf['content'] = $pObj->content;
        // Alternative title for indexing
        $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
        // Character set of content (will be converted to utf-8 during indexing)
        $this->conf['metaCharset'] = $pObj->metaCharset;
        // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
        $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];
        // Configuration of behavior:
        // Whether to index external documents like PDF, DOC etc. (if possible)
        $this->conf['index_externals'] = $pObj->config['config']['index_externals'];
        // Length of description text (max 250, default 200)
        $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];
        // Whether to index keywords/description meta tags; enabled unless configured otherwise
        $this->conf['index_metatags'] = isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE;
        // Set to zero:
        $this->conf['recordUid'] = 0;
        $this->conf['freeIndexUid'] = 0;
        $this->conf['freeIndexSetId'] = 0;
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
325
326 /****************************
327 *
328 * Backend API
329 *
330 ****************************/
/**
 * Initializes the "combined ID" of the page (phash) being indexed (or for which external media
 * is attached). Resets $this->conf to the values given plus fixed defaults, then calls init().
 *
 * @param integer $id The page uid, &id=
 * @param integer $type The page type, &type=
 * @param integer $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param boolean $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
 * @return void
 * @todo Define visibility
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = FALSE) {
    // cHash string for the additional parameters; stays empty unless explicitly requested
    $cHash = '';
    if ($createCHash) {
        /* @var $cacheHashCalculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHashCalculator = GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\CacheHashCalculator');
        $cHash = $cacheHashCalculator->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }
    // Internal configuration for the page the indexing takes place for
    $this->conf = array(
        // Page id (integer)
        'id' => $id,
        // Page type (integer)
        'type' => $type,
        // sys_language UID of the language of the indexing (integer)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => '',
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => TRUE
    );
    // Initialize the indexer with this configuration:
    $this->init();
}
385
/**
 * Stores the free-index uid and set id in $this->conf.
 * Can be called right after backend_initIndexer().
 *
 * @param integer $freeIndexUid Free index UID
 * @param integer $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 * @return void
 * @todo Define visibility
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0) {
    $freeIndexSettings = array(
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId
    );
    foreach ($freeIndexSettings as $confKey => $confValue) {
        $this->conf[$confKey] = $confValue;
    }
}
398
/**
 * Indexes record data as if it were the content of a TYPO3 page: the values are wrapped
 * (HTML-escaped) into a minimal HTML document which is then passed through the regular
 * page indexing in indexTypo3PageContent().
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param integer $mtime Last modification time, in seconds
 * @param integer $crdate The creation date of the content, in seconds
 * @param integer $recordUid The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 * @todo Define visibility
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0) {
    // Construct fake HTML for parsing:
    $fakeHtml = '
<html>
<head>
<title>' . htmlspecialchars($title) . '</title>
<meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
<meta name="description" content="' . htmlspecialchars($description) . '" />
</head>
<body>
' . htmlspecialchars($content) . '
</body>
</html>';
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Content string (HTML of TYPO3 page)
    $this->conf['content'] = $fakeHtml;
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // No alternative title for indexing
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
442
443 /********************************
444 *
445 * Initialization
446 *
447 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Sets up the cHash parameters and page hashes, reads the extension configuration
 * (age limits, external file limit, flag bit mask, metaphone settings) and creates the
 * external parsers, the lexer, the optional metaphone object and the charset converter.
 *
 * @return void
 * @todo Define visibility
 */
public function init() {
    // Initializing:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && count($this->cHashParams)) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface.
    // unserialize() returns FALSE on broken/missing configuration; keep the property an array.
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    $this->indexerConfig = is_array($indexerConfig) ? $indexerConfig : array();
    // Local copy with fallbacks so that missing keys do not raise notices. The fallbacks equal
    // what the missing (NULL) values evaluated to before.
    $settings = $this->indexerConfig + array(
        'minAge' => 0,
        'maxAge' => 0,
        'maxExternalFiles' => 0,
        'flagBitMask' => 0,
        'debugMode' => NULL
    );
    // minAge / maxAge are multiplied by 3600 before being stored as the timestamp limits
    $this->tstamp_minAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['minAge'] * 3600, 0);
    $this->tstamp_maxAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['maxAge'] * 3600, 0);
    $this->maxExternalFiles = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['maxExternalFiles'], 0, 1000, 5);
    $this->flagBitMask = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['flagBitMask'], 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = isset($this->indexerConfig['enableMetaphoneSearch']) ? (bool)$this->indexerConfig['enableMetaphoneSearch'] : TRUE;
    // Metaphone info is only stored as words when the index_words table is not in use
    $this->storeMetaphoneInfoAsWords = !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }
    // Hook/class registrations of this extension (same data the former "global $TYPO3_CONF_VARS" exposed)
    $extConf = array();
    if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']) && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'])) {
        $extConf = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'];
    }
    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjRef = !empty($extConf['lexer']) ? $extConf['lexer'] : 'TYPO3\\CMS\\IndexedSearch\\Lexer';
    $this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
    $this->lexerObj->debug = $settings['debugMode'];
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch && !empty($extConf['metaphone'])) {
        $this->metaphoneObj = GeneralUtility::getUserObj($extConf['metaphone']);
        $this->metaphoneObj->pObj = $this;
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
}
494
/**
 * Initialize external parsers
 *
 * Instantiates every parser registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] (keyed by file
 * extension), hands it a reference to this indexer and keeps it in $this->external_parsers
 * only if its initParser() call succeeds.
 *
 * @return void
 * @access private
 * @see init()
 * @todo Define visibility
 */
public function initializeExternalParsers() {
    if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'])
        || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'])
    ) {
        return;
    }
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
        $parser = GeneralUtility::getUserObj($_objRef);
        // A reference that could not be resolved to an object must not be written to or called;
        // make sure no stale entry stays registered for this extension.
        if (!is_object($parser)) {
            unset($this->external_parsers[$extension]);
            continue;
        }
        $parser->pObj = $this;
        // Init parser and only register it if it does not return FALSE:
        if ($parser->initParser($extension)) {
            $this->external_parsers[$extension] = $parser;
        } else {
            unset($this->external_parsers[$extension]);
        }
    }
}
516
517 /********************************
518 *
519 * Indexing; TYPO3 pages (HTML content)
520 *
521 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * The page is (re-)indexed when the mtime/timestamp check is positive, when the gr_list is not
 * yet registered for the phash, or when indexing is forced. If a record with the same content
 * hash already exists (and the check result is not 1), only timestamp, set id, gr_list and
 * rootline are updated.
 *
 * @return void
 * @todo Define visibility
 */
public function indexTypo3PageContent() {
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }
    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);
    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();
    // Hash over the actual page content, deliberately including title, description and
    // keywords so that e.g. a changed page title is detected as well.
    $this->content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
    // Checks if there is already a page (with gr_list = 0,-1) indexed which has the very same contentHash.
    // If so, the content is already indexed and - regardless of mtime - only the registration data
    // has to be refreshed. This also prevents re-indexing for logged-in fe_users when the page
    // content does not differ from the version indexed without login.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }
    $Pstart = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
598
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as head. If the document has no "<body" tag
 * at all, the whole content is treated as head (so title and meta tags are still found) and
 * the body stays empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ie. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 * @todo Define visibility
 */
public function splitHTMLContent($content) {
    $contentArr = $this->defaultContentArray;
    // Divide head from body. A missing body tag used to produce substr($content, 0, -0),
    // i.e. an EMPTY head, which silently dropped title and meta tags.
    $bodyStart = stripos($content, '<body');
    if ($bodyStart === FALSE) {
        $headPart = (string)$content;
    } else {
        $headPart = (string)substr($content, 0, $bodyStart);
        $contentArr['body'] = substr($content, $bodyStart);
    }
    // Get title; if it contains a colon, only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
    // Get keywords and description metatags
    if ($this->conf['index_metatags']) {
        // Collect the attribute strings of all meta tags; each call removes the handled tag from $headPart
        $metaTags = array();
        $metaParams = NULL;
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $metaParams)) {
            $metaTags[] = $metaParams;
        }
        foreach ($metaTags as $metaTag) {
            $attributes = GeneralUtility::get_tag_attributes($metaTag);
            // Meta tags without name/content (e.g. http-equiv ones) must not raise notices
            $metaName = isset($attributes['name']) ? $attributes['name'] : '';
            $metaContent = isset($attributes['content']) ? $attributes['content'] : '';
            if (stripos($metaName, 'keywords') !== FALSE) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== FALSE) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        // Each successful call strips one occurrence of the section; repeat until none is left
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
651
/**
 * Extract the charset value from HTML meta tag.
 *
 * Recognises the http-equiv form
 * (<meta http-equiv="Content-Type" content="text/html; charset=...">) first and falls back to
 * a "charset" attribute on a meta tag (<meta charset="...">).
 *
 * @param string $content HTML content
 * @return string|NULL The charset value if found, otherwise NULL.
 * @todo Define visibility
 */
public function getHTMLcharset($content) {
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    // Classic declaration: the charset is part of the content-type meta tag
    if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)) {
        if (preg_match($charsetPattern, $reg[0], $reg2)) {
            return $reg2[1];
        }
    }
    // Short declaration: a "charset" attribute directly on a meta tag, quoted or unquoted
    if (preg_match('/<meta[[:space:]]+(?:[^>]*[[:space:]])?charset[[:space:]]*=[[:space:]]*["\']?([[:alnum:]-]+)/i', $content, $reg)) {
        return $reg[1];
    }
    return NULL;
}
666
/**
 * Converts a HTML document to utf-8
 *
 * The charset is taken from the argument or, if empty, detected via getHTMLcharset().
 * Content in a charset other than utf-8 is re-encoded; afterwards entities are converted
 * to utf-8 as well.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 * @todo Define visibility
 */
public function convertHTMLToUtf8($content, $charset = '') {
    // Find charset:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $charset = $this->csObj->parse_charset($charset);
    // Convert charset, unless it is unknown or already utf-8:
    $needsConversion = $charset && $charset !== 'utf-8';
    if ($needsConversion) {
        $content = $this->csObj->utf8_encode($content, $charset);
    }
    // Convert entities, assuming document is now UTF-8:
    return $this->csObj->entities_to_utf8($content, TRUE);
}
687
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The tag name is matched case-insensitively. If the start tag is found but no matching end
 * tag follows, $tagContent is empty and $stringAfter is the text following the start tag.
 * A start tag that is never closed by ">" yields an empty $stringAfter.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
 * @todo Define visibility
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
    $string = (string)$string;
    $startTag = '<' . $tagName;
    $endTag = '</' . $tagName . '>';
    // Case-insensitive search for the start tag
    $tagStart = stripos($string, $startTag);
    if ($tagStart === FALSE) {
        return FALSE;
    }
    // Split what follows the tag name into the attribute list and the remainder after ">".
    // There is no second part if the tag is never closed; fall back to an empty remainder
    // instead of reading an undefined offset.
    $tagParts = explode('>', (string)substr($string, $tagStart + strlen($startTag)), 2);
    $paramList = $tagParts[0];
    $remainder = isset($tagParts[1]) ? $tagParts[1] : '';
    $endTagStart = stripos($remainder, $endTag);
    if ($endTagStart !== FALSE) {
        $tagContent = (string)substr($remainder, 0, $endTagStart);
        // Text before the start tag joined with the text behind the end tag
        $stringAfter = substr($string, 0, $tagStart) . substr($remainder, $endTagStart + strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $remainder;
    }
    return TRUE;
}
722
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin" marker is
 * kept; at an "end" marker the last remembered non-marker segment (if any) is kept.
 *
 * @param string $body HTML Content, passed by reference
 * @return boolean Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 * @todo Define visibility
 */
public function typoSearchTags(&$body) {
    $segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    // No marker present: leave the body untouched
    if (count($segments) <= 1) {
        return FALSE;
    }
    $body = '';
    foreach ($segments as $segment) {
        // [0] is the marker keyword (or leading text), [1] the content behind "-->"
        $markerAndRest = explode('-->', $segment, 2);
        switch (trim($markerAndRest[0])) {
            case 'begin':
                $body .= $markerAndRest[1];
                $pendingSegment = '';
                break;
            case 'end':
                $body .= $pendingSegment;
                break;
            default:
                // Neither marker: remember the segment in case an "end" marker follows
                $pendingSegment = $segment;
        }
    }
    return TRUE;
}
750
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme (and no local path) are passed to indexExternalUrl() when
 * "indexExternalURLs" is configured. Links without a query string that resolve to an existing
 * local file are indexed via indexRegularDocument() or, when the crawler is used for external
 * files, added to the crawler queue.
 *
 * @param string $content HTML content
 * @return void
 * @todo Define visibility
 */
public function extractLinks($content) {
    // Get links:
    $list = $this->extractHyperLinks($content);
    // Stays NULL unless the crawler is to be used; previously the variable was read while undefined
    $crawler = NULL;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $this->includeCrawlerClass();
        $crawler = GeneralUtility::makeInstance('tx_crawler_lib');
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = GeneralUtility::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource);
        if (!is_array($qParts)) {
            $qParts = array();
        }
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== FALSE) {
            $getP = array();
            parse_str($qParts['query'], $getP);
            $linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
            $qParts = parse_url($linkSource);
            if (!is_array($qParts)) {
                $qParts = array();
            }
        }
        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Links carrying a query string are not treated as files
        if (!empty($qParts['query'])) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = array('document' => $linkSource);
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            // Pass on the configuration without the (potentially large) page content
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerFilesHook', $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            $fileInfo = pathinfo($linkSource);
            $ext = isset($fileInfo['extension']) ? strtolower($fileInfo['extension']) : '';
            $this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
828
/**
 * Collects the usable hyperlinks ("a" tags) found in an HTML string.
 *
 * Tags without an href value and hrefs starting with "#" are ignored.
 *
 * @param string $html HTML content to scan
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 * @todo Define visibility
 */
public function extractHyperLinks($html) {
	$parser = GeneralUtility::makeInstance('TYPO3\CMS\Core\Html\HtmlParser');
	$links = array();
	foreach ($parser->splitTags('a', $html) as $position => $tag) {
		// Only the odd positions of the split result are inspected
		// (presumably where splitTags() places the matched tags).
		if ($position % 2 === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($tag, TRUE);
		$tagName = $parser->getFirstTagName($tag);
		if (strtolower($tagName) != 'a') {
			continue;
		}
		$href = $attributes[0]['href'];
		// Skip missing hrefs and in-page anchors
		if (!$href || $href[0] == '#') {
			continue;
		}
		$links[] = array(
			'tag' => $tag,
			'href' => $href,
			'localPath' => $this->createLocalPath($href)
		);
	}
	return $links;
}
858
/**
 * Looks up the href of the first "base" tag that carries a non-empty href.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty value if not found
 */
public function extractBaseHref($html) {
	$baseHref = '';
	$parser = GeneralUtility::makeInstance('TYPO3\CMS\Core\Html\HtmlParser');
	foreach ($parser->splitTags('base', $html) as $position => $tag) {
		// Only the odd positions of the split result are inspected
		if ($position % 2 === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($tag, TRUE);
		$tagName = $parser->getFirstTagName($tag);
		if (strtolower($tagName) != 'base') {
			continue;
		}
		$baseHref = $attributes[0]['href'];
		// Stop at the first tag that actually provides a value
		if ($baseHref) {
			break;
		}
	}
	return $baseHref;
}
883
884 /******************************************
885 *
886 * Indexing; external URL
887 *
888 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The URL is only fetched and indexed if its response headers announce a
 * content type containing "text/html". The fetched content is written to a
 * temporary file, handed to indexRegularDocument() and the temporary file is
 * removed again afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 * @todo Define visibility
 */
public function indexExternalUrl($externalUrl) {
	// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	// HTTP header field names are case-insensitive, so "Content-type" etc.
	// must be accepted as well. The last matching header wins.
	$contentType = '';
	if (is_array($urlHeaders)) {
		foreach ($urlHeaders as $headerName => $headerValue) {
			if (strtolower(trim($headerName)) === 'content-type') {
				$contentType = (string) $headerValue;
			}
		}
	}
	if (!stristr($contentType, 'text/html')) {
		return;
	}
	$content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
	if (!strlen($content)) {
		return;
	}
	// Create temporary file:
	$tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
	if ($tmpFile) {
		GeneralUtility::writeFile($tmpFile, $content);
		// Index that file. Using "TRUE" for second parameter to force indexing
		// of external URLs (mtime doesn't make sense, does it?)
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
		unlink($tmpFile);
	}
}
919
/**
 * Getting HTTP response headers of URL
 *
 * Every non-empty line of the fetched header block is split at the first
 * colon; the part before it becomes the array key, the (untrimmed) rest the
 * value. Lines without a colon (e.g. the HTTP status line) are stored with
 * a NULL value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 * @todo Define visibility
 */
public function getUrlHeaders($url) {
	// Try to get the headers only
	$content = GeneralUtility::getUrl($url, 2);
	if (!strlen($content)) {
		// Explicit FALSE as documented (previously the method fell through and returned NULL)
		return FALSE;
	}
	// Compile headers:
	$retVal = array();
	foreach (GeneralUtility::trimExplode(LF, $content, TRUE) as $line) {
		if (!strlen(trim($line))) {
			break;
		}
		$parts = explode(':', $line, 2);
		// Guard against lines without a colon instead of relying on list() with a missing offset
		$retVal[$parts[0]] = isset($parts[1]) ? $parts[1] : NULL;
	}
	return $retVal;
}
945
/**
 * Checks if the file is local
 *
 * Asks each of the createLocalPath* strategies in a fixed order and stops
 * at the first one that yields a non-empty result.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
	static $resolvers = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);
	$candidate = '';
	foreach ($resolvers as $resolver) {
		$candidate = $this->{$resolver}($sourcePath);
		if ($candidate != '') {
			return $candidate;
		}
	}
	return $candidate;
}
969
/**
 * Attempts to create a local file path from T3VARs. This is useful for
 * various download extensions that hide actual file name but still want the
 * file to be indexed.
 *
 * The lookup table is read from
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] and is keyed
 * by the short MD5 of the source path.
 *
 * @param string $sourcePath
 * @return string Registered file path, or an empty string if nothing matches
 */
protected function createLocalPathFromT3vars($sourcePath) {
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}
	$lookupKey = GeneralUtility::shortMD5($sourcePath);
	// Note: not using self::isAllowedLocalFile here because this method
	// is allowed to index files outside of the web site (for example,
	// protected downloads)
	if (isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey])) {
		return $registeredFiles[$lookupKey];
	}
	return '';
}
992
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * If the source path starts with the TYPO3_SITE_URL value, that prefix is
 * replaced by PATH_site and the result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	$siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);
	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}
	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1012
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, this function does nothing.
 *
 * A non-empty absRefPrefix at the start of the source path is replaced by
 * PATH_site; the result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if nothing matches
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	if (!($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController)) {
		return '';
	}
	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);
	if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}
	$candidate = PATH_site . substr($sourcePath, $prefixLength);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1035
/**
 * Attempts to create a local file path from the absolute URL without
 * schema.
 *
 * A source path starting with "/" is interpreted relative to PATH_site and
 * validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the path is not absolute or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	// substr() instead of $sourcePath[0]: an empty string must not trigger
	// an "uninitialized string offset" notice.
	if (substr($sourcePath, 0, 1) !== '/') {
		return '';
	}
	$localPath = PATH_site . substr($sourcePath, 1);
	return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1054
/**
 * Attempts to create a local file path from the relative URL.
 *
 * A URL accepted by isRelativeURL() is appended to PATH_site and validated
 * with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}
	$candidate = PATH_site . $sourcePath;
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1071
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path does not
 * start with "/". URLs that parse_url() cannot handle are treated as having
 * neither scheme nor path (and thus as relative), as before.
 *
 * @param string $url
 * @return boolean
 */
static protected function isRelativeURL($url) {
	$urlParts = parse_url($url);
	if (!is_array($urlParts)) {
		$urlParts = array();
	}
	// Relative URLs normally have no "scheme" key and may have no "path" key;
	// read both defensively instead of raising undefined index notices.
	$scheme = isset($urlParts['scheme']) ? (string) $urlParts['scheme'] : '';
	$path = isset($urlParts['path']) ? (string) $urlParts['path'] : '';
	return $scheme === '' && substr($path, 0, 1) !== '/';
}
1082
/**
 * Checks if the path points to the file inside the web site
 *
 * The path is first normalised with resolveBackPath(); it must then start
 * with PATH_site and refer to an existing regular file.
 *
 * @param string $filePath
 * @return boolean
 */
static protected function isAllowedLocalFile($filePath) {
	$resolvedPath = GeneralUtility::resolveBackPath($filePath);
	$siteRootLength = strlen(PATH_site);
	$isRegularFile = is_file($resolvedPath);
	if (substr($resolvedPath, 0, $siteRootLength) != PATH_site) {
		return FALSE;
	}
	return $isRegularFile;
}
1095
1096 /******************************************
1097 *
1098 * Indexing; external files (PDF, DOC, etc)
1099 *
1100 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is split into the parts reported by fileContentParts(); each
 * part is (re-)indexed if checkMtimeTstamp() asks for it or $force is set,
 * as long as the external file limit is not exceeded. A section record is
 * written for every part, also when the part itself is not re-indexed.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param boolean $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 * @todo Define visibility
 */
public function indexRegularDocument($file, $force = FALSE, $contentTmpFile = '', $altExtension = '') {
	// Determine the extension which selects the external parser
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$fI = pathinfo($file);
		// Files without an extension have no "extension" key
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}
	// Create abs-path:
	if ($contentTmpFile) {
		$absFile = $contentTmpFile;
	} else {
		if (!GeneralUtility::isAbsPath($file)) {
			// Relative, prepend PATH_site:
			$absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
		} else {
			// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
	}
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
		return;
	}
	// empty() also covers extensions for which no parser was registered at all
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
		return;
	}
	// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext, $absFile);
	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
		$Pstart = GeneralUtility::milliseconds();
		$subinfo = array('key' => $cPKey);
		// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag', 1);
			}
			// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
				// Divide into title,keywords,description and body:
				$this->log_push('Split content', '');
				$contentParts = $this->readFileContent($ext, $absFile, $cPKey);
				$this->log_pull();
				if (is_array($contentParts)) {
					// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
					// implode() is called with the documented (glue, pieces) order; the
					// reversed legacy order is no longer accepted by current PHP versions.
					$content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $contentParts));
					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
						// Increment counter:
						$this->externalFileCounter++;
						// Splitting words
						$this->log_push('Extract words from content', '');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();
						// Analyse the indexed words.
						$this->log_push('Analyse the extracted words', '');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();
						// Submitting page (phash) record
						$this->log_push('Submitting page', '');
						$size = filesize($absFile);
						// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$ctime = filemtime($absFile);
						$this->submitFilePage($phash_arr, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts);
						$this->log_pull();
						// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words', '');
						if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
							$this->checkWordList($indexArr);
							$this->submitWords($indexArr, $phash_arr['phash']);
						}
						$this->log_pull();
						// Set parsetime
						$this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
					} else {
						// Update the timestamp
						$this->updateTstamp($phash_arr['phash'], $mtime);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		}
		// Checking and setting sections:
		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
1209
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * The work is delegated to the parser object registered for the file
 * extension; without such an object NULL is returned.
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array Standard content array (title, description, keywords, body keys)
 * @todo Define visibility
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
	// Consult relevant external document parser:
	$parser = $this->external_parsers[$fileExtension];
	if (!is_object($parser)) {
		return NULL;
	}
	return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1228
/**
 * Creates an array with pointers to divisions of document.
 *
 * The parser object registered for the extension decides on the divisions;
 * without such an object a single division "0" is returned.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 * @todo Define visibility
 */
public function fileContentParts($ext, $absFile) {
	// Consult relevant external document parser:
	$parser = $this->external_parsers[$ext];
	if (!is_object($parser)) {
		return array(0);
	}
	return $parser->fileContentParts($ext, $absFile);
}
1245
/**
 * Splits non-HTML content (from external files for instance)
 *
 * Starts from a copy of the default content array and stores the whole
 * input under the "body" key.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (other keys as provided by the default content array)
 * @see splitHTMLContent()
 * @todo Define visibility
 */
public function splitRegularContent($content) {
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;
	return $parts;
}
1259
1260 /**********************************
1261 *
1262 * Analysing content, Extracting words
1263 *
1264 **********************************/
/**
 * Convert character set and HTML entities in the value of input content array keys
 *
 * Empty values are left untouched. The array is modified in place.
 *
 * @param array $contentArr Standard content array, passed by reference
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 * @todo Define visibility
 */
public function charsetEntity2utf8(&$contentArr, $charset) {
	$mustConvertCharset = $charset !== 'utf-8';
	foreach ($contentArr as $field => $unused) {
		if (!strlen($contentArr[$field])) {
			continue;
		}
		$text = $contentArr[$field];
		// Convert charset if necessary
		if ($mustConvertCharset) {
			$text = $this->csObj->utf8_encode($text, $charset);
		}
		// decode all numeric / html-entities in the string to real characters:
		$contentArr[$field] = $this->csObj->entities_to_utf8($text, TRUE);
	}
}
1285
/**
 * Processing words in the array from split*Content -functions
 *
 * Every value is replaced by the word list the lexer produces for it; the
 * title, keywords and description lists are additionally de-duplicated.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 * @todo Define visibility
 */
public function processWordsInArrays($contentArr) {
	// split all parts to words
	foreach ($contentArr as $field => $text) {
		$contentArr[$field] = $this->lexerObj->split2Words($text);
	}
	// For title, keywords, and description we don't want duplicates:
	foreach (array('title', 'keywords', 'description') as $uniqueField) {
		$contentArr[$uniqueField] = array_unique($contentArr[$uniqueField]);
	}
	// Return modified array:
	return $contentArr;
}
1305
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length is taken from the "index_descrLgd" configuration
 * (forced into the range 0-255, default 200). With a length of 0 no
 * description is generated and an empty string is returned.
 *
 * @param array $contentArr Content array
 * @return string Description string
 * @todo Define visibility
 */
public function bodyDescription($contentArr) {
	// Always start with a string: callers write the result straight into the
	// database, and an undefined variable would end up there as NULL.
	$bodyDescription = '';
	// Setting description
	$maxL = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
	if ($maxL) {
		$bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);
		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}
	return $bodyDescription;
}
1323
/**
 * Analyzes content to use for indexing,
 *
 * Title, keywords and description words are registered through
 * analyzeHeaderinfo() with the bit offsets 7, 6 and 5; afterwards the body
 * words are registered through analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array, keyed by word (see analyzeHeaderinfo() and analyzeBody() for the entries)
 * @todo Define visibility
 */
public function indexAnalyze($content) {
	$indexArr = array();
	$this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
	$this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
	$this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
	$this->analyzeBody($indexArr, $content);
	return $indexArr;
}
1340
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the entry in the index array gets its
 * "hash" and "metaphone" values (on first occurrence), the bit 2^$offset is
 * set in "cmp" and "count" is increased. The word counter of the indexer is
 * increased as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param integer $offset Bit-wise priority to type
 * @return void
 * @todo Define visibility
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
	foreach ($content[$key] as $val) {
		// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);
		if (!isset($retArr[$val])) {
			// Word ID (wid)
			$retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
			// Metaphone value is also 60 only chars long
			$metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
			$retArr[$val]['metaphone'] = $metaphone;
		}
		// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}
		// Priority used for flagBitMask feature (see extension configuration).
		// "cmp" does not exist yet on the first occurrence of a word, so start from 0.
		$currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
		// Increase number of occurences (starting from 0 for a new word)
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1373
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the entry in the index array gets its
 * "first" position, "hash" and "metaphone" values (only if the word is not
 * registered yet) and "count" is increased. The word counter of the indexer
 * is increased as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 * @todo Define visibility
 */
public function analyzeBody(&$retArr, $content) {
	foreach ($content['body'] as $key => $val) {
		// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);
		if (!isset($retArr[$val])) {
			// First occurence (used for ranking results)
			$retArr[$val]['first'] = $key;
			// Word ID (wid)
			$retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
			// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
			$retArr[$val]['metaphone'] = $metaphone;
		}
		// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}
		// Increase number of occurences; "count" does not exist yet for a new word, so start from 0
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1404
/**
 * Creating metaphone based hash from input word
 *
 * Uses the configured metaphone object if there is one, otherwise PHP's
 * native metaphone() function.
 *
 * @param string $word Word to convert
 * @param boolean $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (or raw value, string); 0 if the raw value is empty and no raw value was requested
 * @todo Define visibility
 */
public function metaphone($word, $returnRawMetaphoneValue = FALSE) {
	// Use native PHP function instead of advanced doubleMetaphone class if no object is available
	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);
	if ($returnRawMetaphoneValue) {
		return $rawValue;
	}
	if (!strlen($rawValue)) {
		return 0;
	}
	// Create hash and return integer
	return \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($rawValue);
}
1430
1431 /********************************
1432 *
1433 * SQL; TYPO3 Pages
1434 *
1435 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Removes the existing records for the current phash and writes new
 * index_phash, index_section, index_grlist and index_fulltext records (and
 * an index_debug record in debug mode). Each table is only written if
 * isTableUsed() reports it as used.
 *
 * @return void
 * @todo Define visibility
 */
public function submitPage() {
	// Remove any current data for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);
	// setting new phash_row
	// NOTE(review): the id/reg1/type/mtime/language values are cast with intval()
	// like recordUid/freeIndexUid below, so that unset configuration values are
	// not sent to the database as NULL. This assumes the target columns are
	// integer columns - confirm against the table definition.
	$fields = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => intval($this->conf['id']),
		'data_page_reg1' => intval($this->conf['page_cache_reg1']),
		'data_page_type' => intval($this->conf['type']),
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		// TYPO3 page
		'item_type' => 0,
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => intval($this->conf['mtime']),
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		// Creation date of page
		'item_crdate' => $this->conf['crdate'],
		// Sys language uid of the page. Should reflect which language it DOES actually display!
		'sys_language_uid' => intval($this->conf['sys_language_uid']),
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId'])
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
	}
	// PROCESSING index_section
	$this->submit_section($this->hash['phash'], $this->hash['phash']);
	// PROCESSING index_grlist
	$this->submit_grlist($this->hash['phash'], $this->hash['phash']);
	// PROCESSING index_fulltext
	$fields = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => implode(' ', $this->contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	// Optionally limit the stored full text to the configured number of bytes
	if ($this->indexerConfig['fullTextDataLength'] > 0) {
		$fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
	}
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
	}
	// PROCESSING index_debug
	if ($this->indexerConfig['debugMode']) {
		$fields = array(
			'phash' => $this->hash['phash'],
			'debuginfo' => serialize(array(
				'cHashParams' => $this->cHashParams,
				'external_parsers initialized' => array_keys($this->external_parsers),
				'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
				'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
				'logs' => $this->internal_log,
				'lexer' => $this->lexerObj->debugString
			))
		);
		if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
		}
	}
}
1510
/**
 * Stores gr_list in the database.
 *
 * Writes one index_grlist record built from the configured gr_list, if the
 * table is in use.
 *
 * @param integer $hash Search result record phash
 * @param integer $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 * @todo Define visibility
 */
public function submit_grlist($hash, $phash_x) {
	$groupList = $this->conf['gr_list'];
	// Setting the gr_list record
	$record = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($groupList),
		'gr_list' => $groupList
	);
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $record);
}
1532
/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is completed by getRootLineFields() before it is written to
 * index_section (if that table is in use).
 *
 * @param integer $hash phash of TYPO3 parent search result record
 * @param integer $hash_t3 phash of the file indexation search record
 * @return void
 * @todo Define visibility
 */
public function submit_section($hash, $hash_t3) {
	$record = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);
	$this->getRootLineFields($record);
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $record);
}
1553
/**
 * Removes records for the indexed page, $phash
 *
 * Tables that isTableUsed() reports as unused are skipped.
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedPages($phash) {
	$phashValue = intval($phash);
	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
			continue;
		}
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . $phashValue);
	}
	// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $phashValue);
	}
}
1574
1575 /********************************
1576 *
1577 * SQL; External media
1578 *
1579 *******************************/
/**
 * Updates db with information about the file
 *
 * Removes the existing records for the file's phash and writes new
 * index_phash and index_fulltext records (and an index_debug record in
 * debug mode). Each table is only written if isTableUsed() reports it as
 * used.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param integer $mtime Modification time of file.
 * @param integer $ctime Creation time of file.
 * @param integer $size Size of file in bytes
 * @param integer $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 * @todo Define visibility
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {
	// Find item Type: the parser's mapping wins, the plain extension is the fallback
	$itemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	if (!$itemType) {
		$itemType = $ext;
	}
	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);
	// Split filename: a scheme marks the file as an external URL
	$urlParts = parse_url($file);
	$isExternalUrl = $urlParts['scheme'] ? 1 : 0;
	// Use the file name as title if the document does not provide one
	$title = trim($contentParts['title']) ? $contentParts['title'] : basename($file);
	// Setting new
	$phashRecord = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $itemType,
		'item_title' => $title,
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $isExternalUrl,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
		'sys_language_uid' => intval($this->conf['sys_language_uid'])
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRecord);
	}
	// PROCESSING index_fulltext
	$fullText = implode(' ', $contentParts);
	$maxFullTextLength = $this->indexerConfig['fullTextDataLength'];
	if ($maxFullTextLength > 0) {
		$fullText = substr($fullText, 0, $maxFullTextLength);
	}
	$fulltextRecord = array(
		'phash' => $hash['phash'],
		'fulltextdata' => $fullText,
		'metaphonedata' => $this->metaphoneContent
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRecord);
	}
	// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString
	);
	$debugRecord = array(
		'phash' => $hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRecord);
	}
}
1656
/**
 * Registers a gr_list record for an indexed file, unless index_grlist already
 * holds one for this phash matching either the default group list or the
 * currently configured group list.
 *
 * @param integer $hash phash value of the file
 * @return void
 * @todo Define visibility
 */
public function submitFile_grlist($hash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$defaultListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList);
	$currentListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
	$where = 'phash=' . intval($hash) . ' AND (hash_gr_list=' . $defaultListHash . ' OR hash_gr_list=' . $currentListHash . ')';
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	// No matching record yet: store one, using the file's phash for both arguments
	if ($existing == 0) {
		$this->submit_grlist($hash, $hash);
	}
}
1673
/**
 * Registers a section record for an indexed file, unless index_section already
 * contains one for this phash on the current page ($this->conf['id']).
 *
 * @param integer $hash phash value of the file
 * @return void
 * @todo Define visibility
 */
public function submitFile_section($hash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	$where = 'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id']);
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
	if ($existing == 0) {
		// Second argument is the phash of the page currently being indexed
		$this->submit_section($hash, $this->hash['phash']);
	}
}
1690
/**
 * Deletes the rows registered for an indexed file, identified by $phash, from
 * index_phash, index_grlist, index_fulltext and index_debug. Tables reported
 * as unused are skipped.
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedFiles($phash) {
	$where = 'phash=' . intval($phash);
	foreach (array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug') as $tableName) {
		if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
			continue;
		}
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
	}
}
1707
1708 /********************************
1709 *
1710 * SQL Helper functions
1711 *
1712 *******************************/
/**
 * Decides, from the index_phash row stored for $phash, whether the current
 * page/file has to be (re-)indexed.
 *
 * Result codes (see $this->reasons):
 *  -2  the minimum age was not exceeded, no indexing
 *  -1  mtime matched the indexed row, no indexing (the row's tstamp is
 *      refreshed unless a max age is configured)
 *   1  the configured max age was exceeded, index again
 *   2  mtime differs from the indexed mtime, index again
 *   3  no mtime was given, index
 *   4  no indexed row found (or index_phash is not used), index
 *
 * @param integer $mtime mtime value to test against the limits and the indexed row
 * @param integer $phash phash used to select the already indexed row
 * @return integer Result code: <0 = no indexing, >0 = do indexing
 * @todo Define visibility
 */
public function checkMtimeTstamp($mtime, $phash) {
	// Not indexed (index_phash is not in use)
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return 4;
	}
	$indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . intval($phash));
	// Never indexed before (no row in index_phash)
	if (!$indexedRow) {
		return 4;
	}
	// Max age exceeded: index regardless of mtime
	if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
		return 1;
	}
	// A minimum age is configured and has not been exceeded yet
	if ($this->tstamp_minAge && $indexedRow['tstamp'] + $this->tstamp_minAge >= $GLOBALS['EXEC_TIME']) {
		return -2;
	}
	// Without an mtime there is nothing to compare, so index
	if (!$mtime) {
		return 3;
	}
	// mtime differs from the indexed one: time to re-index
	if ($indexedRow['item_mtime'] != $mtime) {
		return 2;
	}
	// mtime matched: no content change detected
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
	}
	return -1;
}
1769
/**
 * Looks in index_phash for a row with the current phash_grouping and the
 * current content hash ($this->content_md5h).
 *
 * @return mixed TRUE if no such row exists (or index_phash is not used), i.e. the page needs to be indexed; otherwise the matching row as an array containing "phash"
 * @todo Define visibility
 */
public function checkContentHash() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return TRUE;
	}
	$where = 'phash_grouping=' . intval($this->hash['phash_grouping']) . ' AND contentHash=' . intval($this->content_md5h);
	$match = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
	// The page is only indexed when no row with identical content exists in the same grouping
	return $match ? $match : TRUE;
}
1787
/**
 * Checks whether index_phash already holds a row with the given grouping hash
 * and content hash.
 *
 * @param integer $hashGr phash_grouping value to check
 * @param integer $content_md5h Content hash to check
 * @return boolean TRUE if no such row exists (or index_phash is not used), i.e. the document needs to be indexed
 * @todo Define visibility
 */
public function checkExternalDocContentHash($hashGr, $content_md5h) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return TRUE;
	}
	$where = 'phash_grouping=' . intval($hashGr) . ' AND contentHash=' . intval($content_md5h);
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
	return $matches == 0;
}
1805
/**
 * Tells whether index_grlist contains at least one row whose phash_x column
 * equals the given value.
 *
 * @param integer $phash_x phash_x value to test
 * @return boolean TRUE if a row exists; FALSE if none exists or index_grlist is not used
 * @todo Define visibility
 */
public function is_grlist_set($phash_x) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return FALSE;
	}
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . intval($phash_x));
	return $matches > 0;
}
1821
/**
 * Writes a grlist entry for $phash and the currently configured gr_list if
 * index_grlist does not contain one yet, and logs the insertion.
 *
 * @param integer $phash phash of the search result that should be found
 * @param integer $phash_x phash of the current content, passed on to submit_grlist()
 * @return void
 * @see submit_grlist()
 * @todo Define visibility
 */
public function update_grlist($phash, $phash_x) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$listHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
	$where = 'phash=' . intval($phash) . ' AND hash_gr_list=' . $listHash;
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existing != 0) {
		return;
	}
	$this->submit_grlist($phash, $phash_x);
	$this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1840
/**
 * Sets the tstamp of the index_phash row for $phash to the current execution
 * time and, when $mtime is non-empty, also stores it as item_mtime.
 *
 * @param integer $phash phash value
 * @param integer $mtime If set, the item_mtime field is updated to this value
 * @return void
 * @todo Define visibility
 */
public function updateTstamp($phash, $mtime = 0) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$changes = array('tstamp' => $GLOBALS['EXEC_TIME']);
	if ($mtime) {
		$changes['item_mtime'] = (int) $mtime;
	}
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int) $phash, $changes);
}
1860
/**
 * Stores the configured freeIndexSetId in the index_phash row for $phash.
 *
 * @param integer $phash phash value
 * @return void
 * @todo Define visibility
 */
public function updateSetId($phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$changes = array('freeIndexSetId' => (int) $this->conf['freeIndexSetId']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int) $phash, $changes);
}
1876
/**
 * Stores the given parse time in the index_phash row for $phash.
 *
 * @param integer $phash phash value
 * @param integer $parsetime Parsetime value to set
 * @return void
 * @todo Define visibility
 */
public function updateParsetime($phash, $parsetime) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$changes = array('parsetime' => (int) $parsetime);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int) $phash, $changes);
}
1893
/**
 * Rewrites the rootline fields of all index_section rows belonging to the
 * current page ($this->conf['id']) with the values from getRootLineFields().
 *
 * @return void
 * @todo Define visibility
 */
public function updateRootline() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	$rootlineFields = array();
	// Filled by reference
	$this->getRootLineFields($rootlineFields);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int) $this->conf['id'], $rootlineFields);
}
1907
/**
 * Adds the root-line field values to the given field array.
 * rl0, rl1 and rl2 are always set from rootline levels 0, 1 and 2. Further
 * fields can be configured as fieldName => rootline level pairs in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 * @todo Define visibility
 */
public function getRootLineFields(array &$fieldArray) {
	// Standard fields first, so that configured fields are applied afterwards
	foreach (array('rl0' => 0, 'rl1' => 1, 'rl2' => 2) as $standardField => $level) {
		$fieldArray[$standardField] = (int) $this->conf['rootline_uids'][$level];
	}
	if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
		return;
	}
	foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
		$fieldArray[$fieldName] = (int) $this->conf['rootline_uids'][$rootLineLevel];
	}
}
1926
/**
 * Removes indexed pages that share the current phash_grouping and content
 * hash but were registered under a group list other than the default one.
 * NOT USED anywhere inside this class!
 *
 * @return void
 * @todo Define visibility
 */
public function removeLoginpagesWithContentHash() {
	$phashUsed = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash');
	if (!$phashUsed || !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', '
		A.phash=B.phash
		AND A.phash_grouping=' . intval($this->hash['phash_grouping']) . '
		AND B.hash_gr_list<>' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList) . '
		AND A.contentHash=' . intval($this->content_md5h));
	// Iterate only when the query produced a result
	while ($res && FALSE !== ($duplicate = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		$this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $duplicate['phash'] . '\' are now removed.', 1);
		$this->removeOldIndexedPages($duplicate['phash']);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
}
1948
/**
 * Loads class.tx_crawler_lib.php from the path of the "crawler" extension.
 *
 * @return void
 * @todo Define visibility
 */
public function includeCrawlerClass() {
	$crawlerLibrary = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'class.tx_crawler_lib.php';
	GeneralUtility::requireOnce($crawlerLibrary);
}
1958
1959 /********************************
1960 *
1961 * SQL; Submitting words
1962 *
1963 *******************************/
/**
 * Inserts into index_words those words of the list that are not stored yet.
 *
 * @param array $wordListArray Word list array, keyed by base word; each entry provides at least "hash" and "metaphone"
 * @return void
 * @todo Define visibility
 */
public function checkWordList($wordListArray) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') || !count($wordListArray)) {
		return;
	}
	// Collect the word ids for the IN() clause
	$wordIds = array();
	foreach ($wordListArray as $wordInfo) {
		$wordIds[] = (int) $wordInfo['hash'];
	}
	$whereClause = 'wid IN (' . implode(',', $wordIds) . ')';
	$knownCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $whereClause);
	// Every word is already stored: nothing to insert
	if ($knownCount == count($wordListArray)) {
		return;
	}
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $whereClause);
	$this->log_setTSlogMessage('Inserting words: ' . (count($wordListArray) - $knownCount), 1);
	// Drop the words that are already stored; the rest gets inserted below
	while (FALSE != ($knownWord = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		unset($wordListArray[$knownWord['baseword']]);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
	foreach ($wordListArray as $baseword => $wordInfo) {
		// A duplicate-key error will occur here if a stored word was NOT removed by the unset() above.
		// As long as the words are not longer than 60 chars (the baseword varchar is 60 characters) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
			'wid' => $wordInfo['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordInfo['metaphone']
		));
	}
}
2000
/**
 * Replaces the word relations of $phash in index_rel: all existing rows for
 * the phash are deleted, then one row per entry of the word list is inserted.
 *
 * @param array $wordList Word list array; each entry provides "hash", "count", "first" and "cmp"
 * @param integer $phash phash value
 * @return void
 * @todo Define visibility
 */
public function submitWords($wordList, $phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_rel')) {
		return;
	}
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . (int) $phash);
	foreach ($wordList as $wordInfo) {
		$relation = array(
			'phash' => (int) $phash,
			'wid' => (int) $wordInfo['hash'],
			'count' => (int) $wordInfo['count'],
			'first' => (int) $wordInfo['first'],
			// Occurrences of the word relative to the total word count, mapped by freqMap()
			'freq' => $this->freqMap($wordInfo['count'] / $this->wordcount),
			// Only the bits allowed by the flag mask are stored
			'flags' => $wordInfo['cmp'] & $this->flagBitMask
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
2025
/**
 * Maps a frequency using the factor $this->freqMax * 100 * $this->freqRange.
 * Values below 1 are multiplied by the factor and capped at $this->freqRange;
 * values of 1 and above are divided by the factor (the mapping "back").
 *
 * @param double $freq Frequency
 * @return integer Frequency in range.
 * @todo Define visibility
 */
public function freqMap($freq) {
	$factor = $this->freqMax * 100 * $this->freqRange;
	// Mapping back
	if (!($freq < 1)) {
		return $freq / $factor;
	}
	$scaled = $freq * $factor;
	if ($scaled > $this->freqRange) {
		return $this->freqRange;
	}
	return $scaled;
}
2044
2045 /********************************
2046 *
2047 * Hashing
2048 *
2049 *******************************/
/**
 * Computes the search hashes for a TYPO3 page and stores them in $this->hash:
 * "phash_grouping" from id, type, language, mount point and cHash parameters,
 * and "phash" from the same values plus the gr_list.
 *
 * @return void
 * @todo Define visibility
 */
public function setT3Hashes() {
	// The key order matters because the array is serialized before hashing
	$pageIdentity = array(
		'id' => intval($this->conf['id']),
		'type' => intval($this->conf['type']),
		'sys_lang' => intval($this->conf['sys_language_uid']),
		'MP' => strval($this->conf['MP']),
		'cHash' => $this->cHashParams
	);
	// Grouping hash: identifies the "page" independent of the user groups
	$groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($pageIdentity));
	$this->hash['phash_grouping'] = $groupingHash;
	// Plain phash: additionally takes the gr_list into account
	$pageIdentity['gr_list'] = strval($this->conf['gr_list']);
	$this->hash['phash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($pageIdentity));
}
2071
/**
 * Computes the search hashes for an external file.
 * "phash_grouping" is derived from the file name alone, "phash" from the
 * file name together with $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 * @todo Define visibility
 */
public function setExtHashes($file, $subinfo = array()) {
	// The key order matters because the array is serialized before hashing
	$fileIdentity = array('file' => $file);
	$groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($fileIdentity));
	$fileIdentity['subinfo'] = $subinfo;
	$plainHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($fileIdentity));
	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $plainHash
	);
}
2093
2094 /*********************************
2095 *
2096 * Internal logging functions
2097 *
2098 *********************************/
/**
 * Forwards a push() call to $GLOBALS['TT'] when that global is an object;
 * does nothing otherwise.
 *
 * @param string $msg Title to set
 * @param string $key Key, passed through unchanged
 * @return void
 * @todo Define visibility
 */
public function log_push($msg, $key) {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->push($msg, $key);
}
2112
/**
 * Forwards a pull() call to $GLOBALS['TT'] when that global is an object;
 * does nothing otherwise.
 *
 * @return void
 * @todo Define visibility
 */
public function log_pull() {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->pull();
}
2124
/**
 * Records a log message: it is passed to $GLOBALS['TT']->setTSlogMessage()
 * when that global is an object, and always appended to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param integer $errorNum Error number
 * @return void
 * @todo Define visibility
 */
public function log_setTSlogMessage($msg, $errorNum = 0) {
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->setTSlogMessage($msg, $errorNum);
	}
	// Kept in the internal log regardless of whether $GLOBALS['TT'] is an object
	$this->internal_log[] = $msg;
}
2139
2140 /**************************
2141 *
2142 * \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController hooks:
2143 *
2144 **************************/
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as a part of the fulltext index.
 * The comma-separated list is split with trimExplode(), re-joined with
 * ", " and wrapped in a leading and a trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList) {
	$spacedList = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
	return ' ' . $spacedList . ' ';
}
2157
2158 }