[TASK] Update t3lib mentions
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2001-2013 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the TYPO3 project. The TYPO3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 * A copy is found in the textfile GPL.txt and important notices to the license
19 * from the author is found in LICENSE.txt distributed with these scripts.
20 *
21 *
22 * This script is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * This copyright notice MUST APPEAR in all copies of the script!
28 ***************************************************************/
29 /**
30 * This class is a search indexer for TYPO3
31 *
32 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
33 */
34 /**
35 * Indexing class for TYPO3 frontend
36 *
37 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
38 */
39 class Indexer {
40
/**
 * Log messages keyed by the code that the "is re-indexing needed?" check yields.
 * Codes greater than zero are reported as reasons why indexing is needed,
 * the other codes as reasons why it is not.
 *
 * @var array
 * @todo Define visibility
 */
public $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
	2 => 'The minimum age was exceeded and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceeded, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * HTML code blocks to exclude from indexing: comma-separated list of tag names
 * whose sections are removed from the body before words are extracted.
 *
 * @var string
 * @todo Define visibility
 */
public $excludeSections = 'script,style';

/**
 * Supported extensions for external files: external parser objects, keys are
 * file extension names. Values are objects with certain methods.
 *
 * @var array
 * @todo Define visibility
 */
public $external_parsers = array();

/**
 * Fe-group list (pages might be indexed separately for each usergroup
 * combination to support search in access limited pages!)
 *
 * @var string
 * @todo Define visibility
 */
public $defaultGrList = '0,-1';

/**
 * If set, this tells a number of seconds that is the maximum age of an indexed
 * document. Regardless of mtime the document will be re-indexed if this limit
 * is exceeded.
 *
 * @var integer
 * @todo Define visibility
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit (in seconds) before a document can be
 * indexed again. This is regardless of mtime.
 *
 * @var integer
 * @todo Define visibility
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var integer
 * @todo Define visibility
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var boolean
 * @todo Define visibility
 */
public $forceIndexing = FALSE;

/**
 * Set when crawler is detected (internal)
 *
 * @var boolean
 * @todo Define visibility
 */
public $crawlerActive = FALSE;

// INTERNALS:
/**
 * Template for the array of content parts produced when HTML content is split;
 * every key starts out as an empty string.
 *
 * @var array
 * @todo Define visibility
 */
public $defaultContentArray = array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => ''
);

/**
 * @var integer
 * @todo Define visibility
 */
public $wordcount = 0;

/**
 * @var integer
 * @todo Define visibility
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 * @todo Define visibility
 */
public $conf = array();

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 * @todo Define visibility
 */
public $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 * @todo Define visibility
 */
public $hash = array();

/**
 * Hash array for files
 *
 * @var array
 * @todo Define visibility
 */
public $file_phash_arr = array();

/**
 * Content of TYPO3 page, split into its parts (see $defaultContentArray for the keys)
 *
 * @var array
 * @todo Define visibility
 */
public $contentParts = array();

/**
 * Hash calculated over the concatenated content parts of the page being indexed
 *
 * @todo Define visibility
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 * @todo Define visibility
 */
public $internal_log = array();

/**
 * @var string
 * @todo Define visibility
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 * @todo Define visibility
 */
public $cHashParams = array();

/**
 * @var integer
 * @todo Define visibility
 */
public $freqRange = 32000;

/**
 * @var float
 * @todo Define visibility
 */
public $freqMax = 0.1;

/**
 * Whether metaphone search is enabled; (re)set from the extension configuration in init()
 *
 * @var boolean
 * @todo Define visibility
 */
public $enableMetaphoneSearch = FALSE;

/**
 * Set in init(): TRUE only if the "index_words" table is not in use and
 * metaphone search is enabled
 *
 * @var boolean
 * @todo Define visibility
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 * @todo Define visibility
 */
public $metaphoneContent = '';

// Objects:
/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 * @todo Define visibility
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 * @todo Define visibility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 * @todo Define visibility
 */
public $lexerObj;

/**
 * Flag bit mask from the extension configuration, forced into the range 0-255 in init()
 *
 * @var integer
 * @todo Define visibility
 */
public $flagBitMask;
231
232 /**
233 * Parent Object (TSFE) Initialization
234 *
235 * @param object Parent Object (frontend TSFE object), passed by reference
236 * @return void
237 * @todo Define visibility
238 */
public function hook_indexContent(&$pObj) {
	// Settings of this extension as stored by the Extension Manager
	$extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	// The crawler forces re-indexing when it is loaded, a crawler session is running
	// and re-indexing was requested as processing instruction
	$crawlerRequestsReindexing = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($crawlerRequestsReindexing) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}
	// Nothing to do (and nothing to log) unless indexing is enabled for the page
	if (!$pObj->config['config']['index_enable']) {
		return;
	}
	$this->log_push('Index page', '');
	if ($extensionSettings['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid, $pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Collect the uids of the rootline, keeping the rootline keys
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $rootlineKey => $rootlinePage) {
			$rootlineUids[$rootlineKey] = $rootlinePage['uid'];
		}
		$this->conf = array(
			// Information about the page for which the indexing takes place
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			// Mount point variable, if any
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,
			// cHash string and the array of additional parameters it was made from
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,
			// Creation date of the TYPO3 page
			'crdate' => $pObj->page['crdate'],
			// reg1 of the caching table. Not known what practical use this has.
			'page_cache_reg1' => $pObj->page_cache_reg1,
			'rootline_uids' => $rootlineUids,
			// HTML of the TYPO3 page
			'content' => $pObj->content,
			// Alternative title for indexing
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			// Character set of the content (converted to utf-8 during indexing)
			'metaCharset' => $pObj->metaCharset,
			// Most recent modification time (seconds) of the page content; used to
			// evaluate whether the page should be re-indexed
			'mtime' => $pObj->register['SYS_LASTCHANGED'],
			// Whether to index external documents like PDF, DOC etc. (if possible)
			'index_externals' => $pObj->config['config']['index_externals'],
			// Length of description text (max 250, default 200)
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			// Indexing of meta tags is on unless explicitly configured
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,
			// Set to zero
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0
		);
		// Init and start indexing
		$this->init();
		$this->indexTypo3PageContent();
	}
	$this->log_pull();
}
322
323 /****************************
324 *
325 * Backend API
326 *
327 ****************************/
328 /**
329 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
330 *
331 * @param integer The page uid, &id=
332 * @param integer The page type, &type=
333 * @param integer sys_language uid, typically &L=
334 * @param string The MP variable (Mount Points), &MP=
335 * @param array Rootline array of only UIDs.
336 * @param array Array of GET variables to register with this indexing
337 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
338 * @return void
339 * @todo Define visibility
340 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = FALSE) {
	// The cHash stays empty unless the caller explicitly asks for one to be calculated
	$cHash = '';
	if ($createCHash) {
		/* @var $cacheHashCalculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
		$cacheHashCalculator = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\CacheHashCalculator');
		$queryString = \TYPO3\CMS\Core\Utility\GeneralUtility::implodeArrayForUrl('', $cHash_array);
		$cHash = $cacheHashCalculator->generateForParameters($queryString);
	}
	$this->conf = array(
		// Information about the page for which the indexing takes place
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		// Mount point variable, if any
		'MP' => $MP,
		// Group list (hardcoded for now...)
		'gr_list' => '0,-1',
		// cHash string and the array of additional parameters
		'cHash' => $cHash,
		'cHash_array' => $cHash_array,
		// Defaults
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',
		// Rootline as a list of uids
		'rootline_uids' => $uidRL,
		// Whether to index external documents like PDF, DOC etc. (if possible)
		'index_externals' => 1,
		// Length of description text (max 250, default 200)
		'index_descrLgd' => 200,
		// Whether to index document keywords and description (if present)
		'index_metatags' => TRUE
	);
	// Initialize the indexer with this configuration
	$this->init();
}
382
383 /**
384 * Sets the free-index uid. Can be called right after backend_initIndexer()
385 *
386 * @param integer Free index UID
387 * @param integer Set id - an integer identifying the "set" of indexing operations.
388 * @return void
389 * @todo Define visibility
390 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0) {
	// Store both values in the internal configuration, uid first, set id second
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId
	);
	foreach ($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
395
396 /**
397 * Indexing records as the content of a TYPO3 page.
398 *
399 * @param string Title equivalent
400 * @param string Keywords equivalent
401 * @param string Description equivalent
402 * @param string The main content to index
403 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
404 * @param integer Last modification time, in seconds
405 * @param integer The creation date of the content, in seconds
406 * @param integer The record UID that the content comes from (for registration with the indexed rows)
407 * @return void
408 * @todo Define visibility
409 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0) {
	// Escape every piece before it is embedded in the fake HTML document
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);
	// Fake HTML page so that the record can be parsed like a rendered TYPO3 page
	$fakeHtml = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedBody . '
</body>
</html>';
	// Most recent modification time (seconds) of the content
	$this->conf['mtime'] = $mtime;
	// Creation date of the content
	$this->conf['crdate'] = $crdate;
	// UID of the record, if applicable
	$this->conf['recordUid'] = $recordUid;
	$this->conf['content'] = $fakeHtml;
	// Character set of the content (converted to utf-8 during indexing)
	$this->conf['metaCharset'] = $charset;
	// No alternative title for indexing
	$this->conf['indexedDocTitle'] = '';
	// Index the content as if it was a TYPO3 page
	$this->indexTypo3PageContent();
}
439
440 /********************************
441 *
442 * Initialization
443 *
444 *******************************/
445 /**
446 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
447 *
448 * @return void
449 * @todo Define visibility
450 */
public function init() {
	global $TYPO3_CONF_VARS;
	// Take over the additional parameters; a non-empty parameter array gets the
	// cHash added (so that URLs come out right) and the encryption key removed
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}
	// Set phash / phash_grouping which identify the indexed page
	$this->setT3Hashes();
	// Indexer configuration from the Extension Manager interface
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$settings = $this->indexerConfig;
	// Ages are multiplied by 3600 before being stored as second-based limits
	$this->tstamp_minAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['minAge'] * 3600, 0);
	$this->tstamp_maxAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['maxAge'] * 3600, 0);
	$this->maxExternalFiles = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['maxExternalFiles'], 0, 1000, 5);
	$this->flagBitMask = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($settings['flagBitMask'], 0, 255);
	// Workaround: if the extension configuration was not updated yet the value does
	// not exist, in which case metaphone search counts as enabled
	$this->enableMetaphoneSearch = !isset($settings['enableMetaphoneSearch']) || (bool) $settings['enableMetaphoneSearch'];
	// Metaphone info is only stored as words when the index_words table is not in use
	$this->storeMetaphoneInfoAsWords = !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
	// External document parsers are only needed when external files are indexed
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}
	// Lexer (class that deconstructs the text into words): a reference configured in
	// $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] wins over the default
	$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'];
	if (!$lexerObjRef) {
		$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	}
	$this->lexerObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $settings['debugMode'];
	// Metaphone hook, configured in $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'].
	// Make sure that the hook is loaded _after_ indexed_search as this may overwrite
	// the hook depending on the configuration.
	if ($this->enableMetaphoneSearch && $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}
	// Charset conversion object
	$this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
}
493
494 /**
495 * Initialize external parsers
496 *
497 * @return void
498 * @access private
499 * @see init()
500 * @todo Define visibility
501 */
public function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;
	// Parsers are registered per file extension in
	// $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']
	if (!is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}
	foreach ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$parser = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($_objRef);
		// A reference that does not resolve to an object must not be used: calling
		// initParser() on it would end the whole request with a fatal error.
		if (!is_object($parser)) {
			$this->log_setTSlogMessage('External parser for file extension "' . $extension . '" could not be instantiated and was skipped.', 2);
			continue;
		}
		$this->external_parsers[$extension] = $parser;
		$parser->pObj = $this;
		// Init parser and if it returns FALSE, unset its entry again
		if (!$parser->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
515
516 /********************************
517 *
518 * Indexing; TYPO3 pages (HTML content)
519 *
520 *******************************/
521 /**
522 * Start indexing of the TYPO3 page
523 *
524 * @return void
525 * @todo Define visibility
526 */
public function indexTypo3PageContent() {
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);
	// Nothing to do when the timestamps give no reason, the gr_list is registered
	// and indexing is not forced
	if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		return;
	}
	// Log why indexing is needed
	if ($this->forceIndexing) {
		$reasonMessage = 'Indexing needed, reason: Forced';
	} elseif ($check > 0) {
		$reasonMessage = 'Indexing needed, reason: ' . $this->reasons[$check];
	} else {
		$reasonMessage = 'Indexing needed, reason: Updates gr_list!';
	}
	$this->log_setTSlogMessage($reasonMessage, 1);
	// Divide the content into title, keywords, description and body
	$this->log_push('Split content', '');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();
	// Hash over what is to be the actual page content. Title, description and keywords
	// are deliberately included, so that e.g. a changed page title makes a difference.
	$this->content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
	// Checks if there is already a page (with gr_list = 0,-1) indexed which has the very
	// same contentHash. If so, the page is already indexed and regardless of mtime and
	// origContent only the bookkeeping below is needed. This also prevents re-indexing
	// for logged-in fe_users when the page content is not different for them, provided
	// the page has been indexed without login beforehand.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check !== 1) {
		// Update the timestamp
		$this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
		$this->updateSetId($this->hash['phash']);
		// $checkCHash['phash'] is the phash of the result row that is similar to the
		// current phash regarding the content hash
		$this->update_grlist($checkCHash['phash'], $this->hash['phash']);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}
	// Full indexing run; the time it takes is stored as parsetime further down
	$Pstart = \TYPO3\CMS\Core\Utility\GeneralUtility::milliseconds();
	$this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
	$this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
	$this->log_pull();
	// Splitting words
	$this->log_push('Extract words from content', '');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();
	// Analyse the indexed words
	$this->log_push('Analyse the extracted words', '');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();
	// Submitting page (phash) record
	$this->log_push('Submitting page', '');
	$this->submitPage();
	$this->log_pull();
	// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words', '');
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $this->hash['phash']);
	}
	$this->log_pull();
	// Set parsetime
	$this->updateParsetime($this->hash['phash'], \TYPO3\CMS\Core\Utility\GeneralUtility::milliseconds() - $Pstart);
	// Checking external files if configured for
	$this->log_push('Checking external files', '');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
597
598 /**
599 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
600 *
 * @param string HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
602 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
603 * @see splitRegularContent()
604 * @todo Define visibility
605 */
public function splitHTMLContent($content) {
	$parts = $this->defaultContentArray;
	// Everything from the "<body" tag on is the body, whatever precedes it is the head
	$parts['body'] = stristr($content, '<body');
	$head = substr($content, 0, -strlen($parts['body']));
	// Title: when it contains a colon only the part after the first colon is used
	$this->embracingTags($head, 'TITLE', $parts['title'], $unusedRemainder, $unusedAttributes);
	$titleSegments = explode(':', $parts['title'], 2);
	$parts['title'] = trim(end($titleSegments));
	// Keywords and description from the meta tags
	if ($this->conf['index_metatags']) {
		// Pull one meta tag after the other out of the head
		$metaTagAttributeStrings = array();
		$attributeString = NULL;
		while ($this->embracingTags($head, 'meta', $unusedContent, $head, $attributeString)) {
			$metaTagAttributeStrings[] = $attributeString;
		}
		foreach ($metaTagAttributeStrings as $metaTagAttributeString) {
			$attributes = \TYPO3\CMS\Core\Utility\GeneralUtility::get_tag_attributes($metaTagAttributeString);
			if (stripos($attributes['name'], 'keywords') !== FALSE) {
				$parts['keywords'] .= ',' . $this->addSpacesToKeywordList($attributes['content']);
			}
			if (stripos($attributes['name'], 'description') !== FALSE) {
				$parts['description'] .= ',' . $attributes['content'];
			}
		}
	}
	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags
	$this->typoSearchTags($parts['body']);
	// Remove every occurrence of the unwanted sections (ie. scripting and style stuff)
	foreach (explode(',', $this->excludeSections) as $excludedTag) {
		do {
			$sectionRemoved = $this->embracingTags($parts['body'], $excludedTag, $unusedContent, $parts['body'], $unusedAttributes);
		} while ($sectionRemoved);
	}
	// Put a space in front of every tag so that removing the tags cannot glue words together
	$parts['body'] = trim(strip_tags(str_replace('<', ' <', $parts['body'])));
	$parts['keywords'] = trim($parts['keywords']);
	$parts['description'] = trim($parts['description']);
	return $parts;
}
650
651 /**
652 * Extract the charset value from HTML meta tag.
653 *
654 * @param string HTML content
655 * @return string The charset value if found.
656 * @todo Define visibility
657 */
public function getHTMLcharset($content) {
	// Charset names may contain "_", "." and ":" besides letters, digits and "-"
	// (e.g. "Shift_JIS"); an optional quote in front of the value is skipped.
	$charsetValue = '[[:space:]]*=[[:space:]]*["\']?([[:alnum:]_.:-]+)';
	// Classic form: <meta http-equiv="Content-Type" content="text/html; charset=...">
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $contentTypeTag)
		&& preg_match('/charset' . $charsetValue . '/i', $contentTypeTag[0], $charsetMatch)
	) {
		return $charsetMatch[1];
	}
	// Short form: <meta charset="...">. Only accepted when "charset" is the first
	// attribute, so that text inside other attribute values can never match.
	if (preg_match('/<meta[[:space:]]+charset' . $charsetValue . '/i', $content, $charsetMatch)) {
		return $charsetMatch[1];
	}
	// No charset declaration found
	return NULL;
}
665
666 /**
667 * Converts a HTML document to utf-8
668 *
669 * @param string HTML content, any charset
670 * @param string Optional charset (otherwise extracted from HTML)
671 * @return string Converted HTML
672 * @todo Define visibility
673 */
public function convertHTMLToUtf8($content, $charset = '') {
	// Without an explicit charset, use the one declared in the document itself
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);
	// Only content in a known charset other than utf-8 needs to be converted
	$needsConversion = $charset && $charset !== 'utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}
	// Convert entities, assuming the document is utf-8 by now
	return $this->csObj->entities_to_utf8($content, TRUE);
}
686
687 /**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
689 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
690 * <title> of document or removing <script>-sections
691 *
692 * @param string String to search in
 * @param string Tag name, e.g. "script"
694 * @param string Passed by reference: Content inside found tag
695 * @param string Passed by reference: Content after found tag
696 * @param string Passed by reference: Attributes of the found tag.
697 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
698 * @todo Define visibility
699 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
	$openingTag = '<' . $tagName;
	$closingTag = '</' . $tagName . '>';
	// Case-insensitive search; yields the string from the opening tag onwards
	$fromOpeningTag = stristr($string, $openingTag);
	if (!$fromOpeningTag) {
		return FALSE;
	}
	// Whatever stands between the tag name and the first ">" is the attribute list,
	// the rest is everything that follows the opening tag
	list($paramList, $afterOpeningTag) = explode('>', substr($fromOpeningTag, strlen($openingTag)), 2);
	$fromClosingTag = stristr($afterOpeningTag, $closingTag);
	if (!$fromClosingTag) {
		// No closing tag: no content, and only what follows the opening tag is handed back
		$tagContent = '';
		$stringAfter = $afterOpeningTag;
		return TRUE;
	}
	$openingTagPosition = strpos(strtolower($string), strtolower($openingTag));
	$stringBefore = substr($string, 0, $openingTagPosition);
	$tagContent = substr($afterOpeningTag, 0, strlen($afterOpeningTag) - strlen($fromClosingTag));
	// The original string with the whole tag (including its content) cut out
	$stringAfter = $stringBefore . substr($fromClosingTag, strlen($closingTag));
	return TRUE;
}
721
722 /**
723 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
724 *
725 * @param string HTML Content, passed by reference
 * @return boolean Returns TRUE if a TYPO3SEARCH_ tag was found, otherwise FALSE.
727 * @todo Define visibility
728 */
public function typoSearchTags(&$body) {
	// Split at every "<!--TYPO3SEARCH_" marker (one optional whitespace after "<!--")
	$segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
	// No marker found (or the split failed): leave the body untouched
	if (!is_array($segments) || count($segments) <= 1) {
		return FALSE;
	}
	$body = '';
	// Content seen since the last marker which is only indexed if an "end" marker
	// follows. Starts out empty so that it is never read before being set.
	$pendingContent = '';
	foreach ($segments as $segment) {
		$markerAndContent = explode('-->', $segment, 2);
		$marker = trim($markerAndContent[0]);
		if ($marker === 'begin') {
			// Everything after a "begin" marker is indexed; the marker may lack its "-->"
			$body .= isset($markerAndContent[1]) ? $markerAndContent[1] : '';
			$pendingContent = '';
		} elseif ($marker === 'end') {
			// Content preceding an "end" marker that was not opened by "begin" is indexed.
			// It is cleared afterwards so that a second "end" cannot add it again.
			$body .= $pendingContent;
			$pendingContent = '';
		} else {
			$pendingContent = $segment;
		}
	}
	return TRUE;
}
749
750 /**
751 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
752 *
753 * @param string HTML content
754 * @return void
755 * @todo Define visibility
756 */
public function extractLinks($content) {
	// Get links:
	$list = $this->extractHyperLinks($content);
	// The crawler object only exists when files are to be queued instead of being indexed
	// right away. It is initialized explicitly because it is checked for every local file.
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_lib');
	}
	// Traverse links:
	foreach ($list as $linkInfo) {
		// localPath means: This file is sent by a download script. While the indexed URL has
		// to point to $linkInfo['href'], the absolute path to the file is specified here!
		$hasLocalPath = !empty($linkInfo['localPath']);
		// Decode entities:
		$linkSource = \TYPO3\CMS\Core\Utility\GeneralUtility::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
		// Parse URL:
		$qParts = parse_url($linkSource);
		// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
			parse_str($qParts['query'], $getP);
			$linkSource = $getP['jumpurl'];
			$qParts = parse_url($linkSource);
		}
		if (!$hasLocalPath && !empty($qParts['scheme'])) {
			// Index external URL (http or otherwise), if configured
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}
		// Links carrying a query string are not treated as files
		if (!empty($qParts['query'])) {
			continue;
		}
		$linkSource = urldecode($linkSource);
		if (\TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}
		if (is_object($crawler)) {
			// Hand the file over to the crawler queue. Files delivered by a download
			// script additionally carry the URL under which they are linked.
			$params = array(
				'document' => $linkSource
			);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			// The page content is not part of the queue entry
			unset($params['conf']['content']);
			$crawler->addQueueEntry_callBack(0, $params, 'EXT:indexed_search/class.crawler.php:&TYPO3\\CMS\\IndexedSearch\\Controller\\SearchFormController_files', $this->conf['id']);
			$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
		} elseif ($hasLocalPath) {
			// Index local file: the indexed URL is the href, the file is read from the
			// local path. The extension is an empty string if the path has none.
			$ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
827
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Anchors without an href attribute and anchors whose href starts with "#"
 * are skipped.
 *
 * @param string $html HTML content to scan for <a> tags
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 * @todo Define visibility
 */
public function extractHyperLinks($html) {
    $htmlParser = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Core\Html\HtmlParser');
    $hyperLinksData = array();
    foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
        // Only the odd-indexed parts of the split result are treated as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) != 'a') {
            continue;
        }
        // Anchors such as <a name="..."> carry no href at all
        $href = isset($tagAttributes[0]['href']) ? (string)$tagAttributes[0]['href'] : '';
        if ($href && $href[0] !== '#') {
            $hyperLinksData[] = array(
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath($href)
            );
        }
    }
    return $hyperLinksData;
}
857
/**
 * Extracts the "base href" from content string.
 *
 * The first <base> tag with a non-empty href wins.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html) {
    $href = '';
    $htmlParser = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Core\Html\HtmlParser');
    foreach ($htmlParser->splitTags('base', $html) as $index => $tagData) {
        // Only the odd-indexed parts of the split result are treated as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) != 'base') {
            continue;
        }
        // A <base> tag without href must yield '' (not NULL) to honour the documented return type
        $href = isset($tagAttributes[0]['href']) ? (string)$tagAttributes[0]['href'] : '';
        if ($href) {
            break;
        }
    }
    return $href;
}
882
883 /******************************************
884 *
885 * Indexing; external URL
886 *
887 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The URL is only fetched and indexed when its response headers announce a
 * "text/html" content type. The fetched content is also kept in
 * $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 * @todo Define visibility
 */
public function indexExternalUrl($externalUrl) {
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the remote host
        return;
    }
    // HTTP header names are case-insensitive, so normalise before the lookup
    $urlHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
    if (!isset($urlHeaders['content-type']) || stristr($urlHeaders['content-type'], 'text/html') === FALSE) {
        return;
    }
    $content = ($this->indexExternalUrl_content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($externalUrl));
    if (!strlen((string)$content)) {
        return;
    }
    // Create temporary file:
    $tmpFile = \TYPO3\CMS\Core\Utility\GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    \TYPO3\CMS\Core\Utility\GeneralUtility::writeFile($tmpFile, $content);
    try {
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
    } catch (\Exception $exception) {
        // Do not leave the temporary file behind when indexing fails
        unlink($tmpFile);
        throw $exception;
    }
    unlink($tmpFile);
}
918
/**
 * Getting HTTP request headers of URL
 *
 * Each header line is split at its first colon: the part before becomes the
 * array key, the (untrimmed) part after becomes the value. Lines without a
 * colon (such as the HTTP status line) are stored with a NULL value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 * @todo Define visibility
 */
public function getUrlHeaders($url) {
    // Try to get the headers only
    $content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($url, 2);
    if (!strlen((string)$content)) {
        return FALSE;
    }
    // Compile headers:
    $retVal = array();
    foreach (\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(LF, $content, 1) as $line) {
        if (!strlen(trim($line))) {
            break;
        }
        $lineParts = explode(':', $line, 2);
        $retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
    }
    return $retVal;
}
944
/**
 * Checks if the file is local
 *
 * Tries a fixed sequence of resolver methods and stops at the first one that
 * yields a non-empty path.
 *
 * @param string $sourcePath Link target as found in the href attribute
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
    // Order matters: the first resolver returning a non-empty path wins
    $resolvers = array(
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    );
    $resolvedPath = '';
    foreach ($resolvers as $resolver) {
        $resolvedPath = $this->{$resolver}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
968
/**
 * Attempts to create a local file path from T3VARs. This is useful for
 * various download extensions that hide actual file name but still want the
 * file to be indexed.
 *
 * The lookup key is the short MD5 of $sourcePath inside
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 *
 * @param string $sourcePath
 * @return string Registered file path if it exists as a file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
    // Usually nothing is registered at all; bail out without touching undefined indexes
    if (!isset($GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])
        || !is_array($GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])
    ) {
        return '';
    }
    $indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    $md5 = \TYPO3\CMS\Core\Utility\GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    if (isset($indexLocalFiles[$md5]) && is_file($indexLocalFiles[$md5])) {
        return $indexLocalFiles[$md5];
    }
    return '';
}
991
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * If $sourcePath starts with the TYPO3_SITE_URL, that prefix is replaced by
 * PATH_site and the result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or empty string if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
    $siteUrl = \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1011
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, this function does nothing.
 *
 * @param string $sourcePath
 * @return string Local path, or empty string if the prefix does not match or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
    if (!($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    // An empty prefix would match every path, so it is never considered a match
    if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1034
/**
 * Attempts to create a local file path from the absolute URL without
 * schema.
 *
 * Applies only to paths starting with "/"; the leading slash is replaced by
 * PATH_site and the result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or empty string if not applicable or not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
    $sourcePath = (string)$sourcePath;
    // Check for the empty string first: reading offset 0 of '' raises a notice/warning
    if ($sourcePath === '' || $sourcePath[0] !== '/') {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1053
/**
 * Attempts to create a local file path from the relative URL.
 *
 * The relative URL is appended to PATH_site and the result is validated
 * with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1070
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path does not
 * start with "/". URLs that parse_url() cannot parse are reported as
 * relative as well (unchanged legacy behaviour).
 *
 * @param string $url
 * @return boolean
 */
static protected function isRelativeURL($url) {
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns FALSE on seriously malformed URLs
        return TRUE;
    }
    // Both keys are optional in the parse_url() result
    $scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
    $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
    return $scheme == '' && substr($path, 0, 1) !== '/';
}
1081
/**
 * Checks if the path points to the file inside the web site
 *
 * The path is first passed through GeneralUtility::resolveBackPath(); the
 * result must start with PATH_site and must be an existing regular file.
 *
 * @param string $filePath
 * @return boolean
 */
static protected function isAllowedLocalFile($filePath) {
    $resolvedPath = \TYPO3\CMS\Core\Utility\GeneralUtility::resolveBackPath($filePath);
    if (substr($resolvedPath, 0, strlen(PATH_site)) != PATH_site) {
        return FALSE;
    }
    return is_file($resolvedPath);
}
1094
1095 /******************************************
1096 *
1097 * Indexing; external files (PDF, DOC, etc)
1098 *
1099 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is divided into the sections reported by fileContentParts()
 * and every section is indexed on its own. A section record is written for
 * each section, even when the content itself does not need re-indexing.
 *
 * @param string $file Relative filename, relative to PATH_site. It can also be an absolute path (detected with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()) as long as it passes \TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param boolean $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 * @todo Define visibility
 */
public function indexRegularDocument($file, $force = FALSE, $contentTmpFile = '', $altExtension = '') {
    // Init
    $fI = pathinfo($file);
    if ($altExtension) {
        $ext = $altExtension;
    } else {
        // pathinfo() omits the "extension" key for names without a dot
        $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
    }
    // Create abs-path:
    if ($contentTmpFile) {
        $absFile = $contentTmpFile;
    } else {
        if (!\TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = \TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $mtime = filemtime($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = array();
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = \TYPO3\CMS\Core\Utility\GeneralUtility::milliseconds();
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $subinfo = array('key' => $cPKey);
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the reversed legacy argument order is rejected by PHP 8
                    $content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        $size = filesize($absFile);
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $ctime = filemtime($absFile);
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], \TYPO3\CMS\Core\Utility\GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $mtime);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed.
        // Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1208
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * The work is delegated to the parser object registered for the extension
 * in $this->external_parsers.
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array Standard content array (title, description, keywords, body keys); NULL if no parser object is registered for the extension
 * @todo Define visibility
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
    // Consult relevant external document parser:
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return NULL;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1227
/**
 * Creates an array with pointers to divisions of document.
 *
 * Without a parser object for the extension the document is treated as one
 * single section, array(0).
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 * @todo Define visibility
 */
public function fileContentParts($ext, $absFile) {
    // Consult relevant external document parser:
    $parser = $this->external_parsers[$ext];
    return is_object($parser) ? $parser->fileContentParts($ext, $absFile) : array(0);
}
1244
/**
 * Splits non-HTML content (from external files for instance)
 *
 * Starts from a copy of $this->defaultContentArray and puts the whole input
 * into its "body" key.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (other keys as provided by the default content array)
 * @see splitHTMLContent()
 * @todo Define visibility
 */
public function splitRegularContent($content) {
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;
    return $parts;
}
1258
1259 /**********************************
1260 *
1261 * Analysing content, Extracting words
1262 *
1263 **********************************/
/**
 * Convert character set and HTML entities in the value of input content array keys
 *
 * Empty values are left untouched. The array is modified in place.
 *
 * @param array $contentArr Standard content array, passed by reference
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 * @todo Define visibility
 */
public function charsetEntity2utf8(&$contentArr, $charset) {
    $needsCharsetConversion = $charset !== 'utf-8';
    foreach (array_keys($contentArr) as $partName) {
        $text = $contentArr[$partName];
        if (!strlen($text)) {
            continue;
        }
        // Convert charset if necessary
        if ($needsCharsetConversion) {
            $text = $this->csObj->utf8_encode($text, $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$partName] = $this->csObj->entities_to_utf8($text, TRUE);
    }
}
1284
/**
 * Processing words in the array from split*Content -functions
 *
 * Every value is split into words by the lexer; for title, keywords and
 * description duplicate words are removed afterwards.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 * @todo Define visibility
 */
public function processWordsInArrays($contentArr) {
    // split all parts to words
    foreach (array_keys($contentArr) as $partName) {
        $contentArr[$partName] = $this->lexerObj->split2Words($contentArr[$partName]);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (array('title', 'keywords', 'description') as $headerPart) {
        $contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
    }
    return $contentArr;
}
1304
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length comes from $this->conf['index_descrLgd'], forced into
 * the range 0-255 with 200 as default. A length of 0 yields an empty string.
 *
 * @param array $contentArr Content array; only the "body" key is read
 * @return string Description string
 * @todo Define visibility
 */
public function bodyDescription($contentArr) {
    // Defined up front so that a maximum length of 0 returns '' instead of an undefined variable
    $bodyDescription = '';
    // Setting description
    $maxL = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }
    return $bodyDescription;
}
1322
/**
 * Analyzes content to use for indexing,
 *
 * Title, keywords and description are processed by analyzeHeaderinfo() with
 * the bit offsets 7, 6 and 5; the body is processed by analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array, keyed by word
 * @todo Define visibility
 */
public function indexAnalyze($content) {
    $indexArr = array();
    // Section name => bit offset handed to analyzeHeaderinfo()
    $headerBitOffsets = array(
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    );
    foreach ($headerBitOffsets as $section => $bitOffset) {
        $this->analyzeHeaderinfo($indexArr, $content, $section, $bitOffset);
    }
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1339
/**
 * Calculates relevant information for headercontent
 *
 * For each word of $content[$key] the entry in $retArr gets its "hash" and
 * "metaphone" set (first occurrence only), bit 2^$offset added to "cmp" and
 * "count" increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param integer $offset Bit-wise priority to type
 * @return void
 * @todo Define visibility
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // NOTE(review): substr() counts bytes, so a long multi-byte word may be cut mid-character - confirm whether this matters.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" and "count" do not exist yet on the first occurrence of a word.
        $currentCmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentCmp | pow(2, $offset);
        // Increase number of occurences
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1372
/**
 * Calculates relevant information for bodycontent
 *
 * For each word of $content['body'] the entry in $retArr gets "first" (the
 * word's array key), "hash" and "metaphone" set on its first occurrence, and
 * "count" increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array; only the "body" key is read
 * @return void
 * @todo Define visibility
 */
public function analyzeBody(&$retArr, $content) {
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // NOTE(review): substr() counts bytes, so a long multi-byte word may be cut mid-character - confirm whether this matters.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurences ("count" does not exist yet on the first occurrence)
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1403
/**
 * Creating metaphone based hash from input word
 *
 * Uses $this->metaphoneObj when it is an object, otherwise PHP's native
 * metaphone() function.
 *
 * @param string $word Word to convert
 * @param boolean $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value as string
 * @todo Define visibility
 */
public function metaphone($word, $returnRawMetaphoneValue = FALSE) {
    if (is_object($this->metaphoneObj)) {
        $rawValue = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
    } else {
        // Use native PHP function instead of advanced doubleMetaphone class
        $rawValue = metaphone($word);
    }
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if (strlen($rawValue)) {
        // Create hash and return integer
        return \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($rawValue);
    }
    return 0;
}
1429
1430 /********************************
1431 *
1432 * SQL; TYPO3 Pages
1433 *
1434 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Existing records for the page hash are removed first. Then the records
 * for index_phash, index_section, index_grlist and index_fulltext are
 * written, plus index_debug when debug mode is enabled.
 *
 * @return void
 * @todo Define visibility
 */
public function submitPage() {
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // setting new phash_row
    $phashRow = array(
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // Item type 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => $this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    );
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fullTextRow = array(
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    );
    // Optionally limit the stored fulltext to the configured length
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fullTextRow['fulltextdata'] = substr($fullTextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fullTextRow);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        // Page content and body are truncated to 1000 bytes for the debug record
        $debugInfo = array(
            'cHashParams' => $this->cHashParams,
            'external_parsers initialized' => array_keys($this->external_parsers),
            'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
            'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString
        );
        $debugRow = array(
            'phash' => $this->hash['phash'],
            'debuginfo' => serialize($debugInfo)
        );
        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
            $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
        }
    }
}
1509
/**
 * Stores gr_list in the database.
 *
 * Writes one index_grlist record built from the two hashes and
 * $this->conf['gr_list'], provided the table is in use.
 *
 * @param integer $hash Search result record phash
 * @param integer $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 * @todo Define visibility
 */
public function submit_grlist($hash, $phash_x) {
    // Setting the gr_list record
    $groupList = $this->conf['gr_list'];
    $grListRow = array(
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    );
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grListRow);
}
1531
/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is passed to getRootLineFields() before it is inserted into
 * index_section (if that table is in use).
 *
 * @param integer $hash phash of TYPO3 parent search result record
 * @param integer $hash_t3 phash of the file indexation search record
 * @return void
 * @todo Define visibility
 */
public function submit_section($hash, $hash_t3) {
    $sectionRow = array(
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    );
    $this->getRootLineFields($sectionRow);
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1552
/**
 * Removes records for the indexed page, $phash
 *
 * Deletes by "phash" from index_phash, index_section, index_grlist,
 * index_fulltext and index_debug, and additionally by "phash_t3" from
 * index_section. Tables that are not in use are skipped.
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedPages($phash) {
    $phashValue = (int)$phash;
    // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
    $tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($tables as $tableName) {
        if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
            $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . $phashValue);
        }
    }
    // Removing all index_section records with hash_t3 set to this hash (this includes such records set for
    // external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
    if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $phashValue);
    }
}
1573
1574 /********************************
1575 *
1576 * SQL; External media
1577 *
1578 *******************************/
/**
 * Writes the index records for an external file.
 *
 * Any existing records for the phash are removed first. Then a row is inserted
 * into index_phash and index_fulltext and, when debug mode is enabled, into
 * index_debug. Each insert is skipped if the respective table is not in use.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param integer $mtime Modification time of file.
 * @param integer $ctime Creation time of file.
 * @param integer $size Size of file in bytes
 * @param integer $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 * @todo Define visibility
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {
	// Find item type; fall back to the plain extension if the parser maps nothing for it:
	$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	$storeItemType = $storeItemType ? $storeItemType : $ext;
	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);
	// Split filename. parse_url() leaves out the "scheme" key for local paths and
	// may return FALSE for malformed input, so the key is tested with empty()
	// instead of being read directly (which raised an undefined index notice).
	$fileParts = parse_url($file);
	$isExternalUrl = !empty($fileParts['scheme']);
	// Setting new
	$fields = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $isExternalUrl ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId'])
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
	}
	// PROCESSING index_fulltext
	$fields = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	// Optionally truncate the stored full text to the configured length
	if ($this->indexerConfig['fullTextDataLength'] > 0) {
		$fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
	}
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
	}
	// PROCESSING index_debug
	if ($this->indexerConfig['debugMode']) {
		$fields = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize(array(
				'cHashParams' => $subinfo,
				// Only the first 1000 bytes of the body are kept in the debug record
				'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
				'logs' => $this->internal_log,
				'lexer' => $this->lexerObj->debugString
			))
		);
		if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
		}
	}
}
1654
/**
 * Inserts a gr_list record for a file unless index_grlist already holds one
 * for this phash with either the default gr_list or the current gr_list.
 *
 * @param integer $hash phash value of file
 * @return void
 * @todo Define visibility
 */
public function submitFile_grlist($hash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	// If a record exists for a non-logged in user (default gr_list) or for the
	// current gr_list there is no need to place another one.
	$where = 'phash=' . intval($hash)
		. ' AND (hash_gr_list=' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList)
		. ' OR hash_gr_list=' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list'])
		. ')';
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existing != 0) {
		return;
	}
	$this->submit_grlist($hash, $hash);
}
1671
/**
 * Inserts a section record for a file unless index_section already holds one
 * for this phash on the current page ($this->conf['id']).
 *
 * @param integer $hash phash value of file
 * @return void
 * @todo Define visibility
 */
public function submitFile_section($hash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	// Testing if there is already a section
	$where = 'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id']);
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
	if ($existing != 0) {
		return;
	}
	// The section is stored with the phash of the page currently being indexed
	$this->submit_section($hash, $this->hash['phash']);
}
1688
/**
 * Deletes the records of an indexed file, identified by $phash, from
 * index_phash, index_grlist, index_fulltext and index_debug.
 * Tables that are not in use are skipped.
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedFiles($phash) {
	$where = 'phash=' . intval($phash);
	// Removing old registrations for tables.
	$tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
			continue;
		}
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
	}
}
1705
1706 /********************************
1707 *
1708 * SQL Helper functions
1709 *
1710 *******************************/
/**
 * Compares mtime / tstamp of the already indexed page/file (looked up by phash)
 * with the configured min/max age and the given mtime, and tells whether the
 * document has to be indexed (again).
 *
 * When the mtime matches and no max age is configured, the tstamp of the
 * index_phash row is refreshed as a side effect.
 *
 * @param integer $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param integer $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return integer Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) mtime matched so no need to reindex page. 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index.
 * @todo Define visibility
 */
public function checkMtimeTstamp($mtime, $phash) {
	// Without the index_phash table nothing can have been indexed yet
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return 4;
	}
	$row = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . intval($phash));
	if (!$row) {
		// Page has never been indexed (is not represented in the index_phash table).
		return 4;
	}
	if ($this->tstamp_maxAge && $row['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
		// The configured max-age was exceeded for the document and thus it's indexed.
		return 1;
	}
	if ($this->tstamp_minAge && !($row['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'])) {
		// The minimum age was not exceeded
		return -2;
	}
	// From here on minAge is either not set or exceeded, so mtime decides
	if (!$mtime) {
		// The minimum age was exceeded, but mtime was not set, so the page is indexed.
		return 3;
	}
	if ($row['item_mtime'] != $mtime) {
		// mtime differs from the one stored in index_phash: time to re-index.
		return 2;
	}
	// mtime matched the document, so no changes detected and no content updated
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
	}
	return -1;
}
1767
/**
 * Looks for an index_phash row with the current phash_grouping and content hash.
 *
 * @return mixed TRUE if the page needs to be indexed (no such row, or index_phash is not in use), otherwise the found row (array with key "phash") to which the grlist record should be related
 * @todo Define visibility
 */
public function checkContentHash() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return TRUE;
	}
	// With this query the page will only be indexed if its content is different from the same "phash_grouping" -page.
	$where = 'phash_grouping=' . intval($this->hash['phash_grouping']) . ' AND contentHash=' . intval($this->content_md5h);
	$match = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
	return $match ? $match : TRUE;
}
1785
/**
 * Checks for external documents whether index_phash already holds a row with
 * the given grouping hash and content hash.
 *
 * @param integer $hashGr phash value to check (phash_grouping)
 * @param integer $content_md5h Content hash to check
 * @return boolean TRUE if the document needs to be indexed (no matching row, or index_phash is not in use)
 * @todo Define visibility
 */
public function checkExternalDocContentHash($hashGr, $content_md5h) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return TRUE;
	}
	$where = 'phash_grouping=' . intval($hashGr) . ' AND contentHash=' . intval($content_md5h);
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
	return $matches == 0;
}
1803
/**
 * Tells whether index_grlist contains a record whose phash_x equals the given
 * value (the "real" phash of the current content, not the linked-to phash of
 * the common search result page).
 *
 * @param integer $phash_x Phash integer to test.
 * @return boolean TRUE if at least one record exists; FALSE otherwise or if index_grlist is not in use
 * @todo Define visibility
 */
public function is_grlist_set($phash_x) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return FALSE;
	}
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . intval($phash_x));
	return $matches > 0;
}
1819
/**
 * Writes a grlist entry for the phash and the current gr_list if none exists yet.
 *
 * @param integer $phash phash of the search result that should be found
 * @param integer $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return void
 * @see submit_grlist()
 * @todo Define visibility
 */
public function update_grlist($phash, $phash_x) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$where = 'phash=' . intval($phash) . ' AND hash_gr_list=' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existing != 0) {
		return;
	}
	$this->submit_grlist($phash, $phash_x);
	$this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1838
/**
 * Sets tstamp of the index_phash row to the current execution time and,
 * optionally, item_mtime to the given value.
 *
 * @param integer $phash phash value
 * @param integer $mtime If set, update the mtime field to this value.
 * @return void
 * @todo Define visibility
 */
public function updateTstamp($phash, $mtime = 0) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$changes = array('tstamp' => $GLOBALS['EXEC_TIME']);
	if ($mtime) {
		$changes['item_mtime'] = (int) $mtime;
	}
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int) $phash, $changes);
}
1858
/**
 * Writes the configured freeIndexSetId into the index_phash record.
 *
 * @param integer $phash phash value
 * @return void
 * @todo Define visibility
 */
public function updateSetId($phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$changes = array('freeIndexSetId' => (int) $this->conf['freeIndexSetId']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int) $phash, $changes);
}
1874
/**
 * Stores the parsetime in the index_phash row.
 *
 * @param integer $phash phash value.
 * @param integer $parsetime Parsetime value to set.
 * @return void
 * @todo Define visibility
 */
public function updateParsetime($phash, $parsetime) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		return;
	}
	$changes = array('parsetime' => (int) $parsetime);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int) $phash, $changes);
}
1891
/**
 * Rewrites the root-line fields of all index_section rows belonging to the
 * current page ($this->conf['id']).
 *
 * @return void
 * @todo Define visibility
 */
public function updateRootline() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	// Filled by reference with the rl* fields
	$rootLineFields = array();
	$this->getRootLineFields($rootLineFields);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int) $this->conf['id'], $rootLineFields);
}
1905
/**
 * Adds the root-line field values to the given field array.
 * rl0, rl1 and rl2 are always set from $this->conf['rootline_uids']; further
 * fields may be configured as "field name => root-line level" pairs in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 * @todo Define visibility
 */
public function getRootLineFields(array &$fieldArray) {
	// Standard fields rl0..rl2
	foreach (array(0, 1, 2) as $level) {
		$fieldArray['rl' . $level] = (int) $this->conf['rootline_uids'][$level];
	}
	// Additionally configured fields
	$extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($extraFields)) {
		return;
	}
	foreach ($extraFields as $fieldName => $rootLineLevel) {
		$fieldArray[$fieldName] = (int) $this->conf['rootline_uids'][$rootLineLevel];
	}
}
1924
/**
 * Removes any indexed pages with userlogins which have the same contentHash
 * as the currently indexed page.
 * NOT USED anywhere inside this class!
 *
 * Selects the phash of every index_phash row in the current phash_grouping
 * that has the current content hash and a gr_list other than the default one,
 * and removes the indexed page for each of them.
 *
 * @return void
 * @todo Define visibility
 */
public function removeLoginpagesWithContentHash() {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash') || !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', '
		A.phash=B.phash
		AND A.phash_grouping=' . intval($this->hash['phash_grouping']) . '
		AND B.hash_gr_list<>' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList) . '
		AND A.contentHash=' . intval($this->content_md5h));
	// A failed query yields no result resource: nothing to fetch and nothing to
	// free. Previously sql_free_result() was still called with the invalid result.
	if (!$res) {
		return;
	}
	while (FALSE !== ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		$this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
		$this->removeOldIndexedPages($row['phash']);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
}
1946
/**
 * Loads class.tx_crawler_lib.php from the "crawler" extension directory.
 *
 * @return void
 * @todo Define visibility
 */
public function includeCrawlerClass() {
	$crawlerLibFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'class.tx_crawler_lib.php';
	\TYPO3\CMS\Core\Utility\GeneralUtility::requireOnce($crawlerLibFile);
}
1956
1957 /********************************
1958 *
1959 * SQL; Submitting words
1960 *
1961 *******************************/
/**
 * Inserts those words of the list into index_words that are not stored there yet.
 *
 * The number of rows matching the word hashes is compared with the size of the
 * list; only if they differ, the already known basewords are removed from the
 * list and the remaining words are inserted.
 *
 * @param array $wordListArray Word List array, keyed by baseword (each entry has information about hash, metaphone etc).
 * @return void
 * @todo Define visibility
 */
public function checkWordList($wordListArray) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
		return;
	}
	$totalWords = count($wordListArray);
	if (!$totalWords) {
		return;
	}
	// Collect the word hashes for the IN() list
	$wordIds = array();
	foreach ($wordListArray as $wordInfo) {
		$wordIds[] = (int) $wordInfo['hash'];
	}
	$where = 'wid IN (' . implode(',', $wordIds) . ')';
	$knownCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
	if ($knownCount == $totalWords) {
		// Every word is already registered
		return;
	}
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
	$this->log_setTSlogMessage('Inserting words: ' . ($totalWords - $knownCount), 1);
	// Drop the words that exist already, leaving only the new ones in the list
	while (FALSE != ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		unset($wordListArray[$row['baseword']]);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
	foreach ($wordListArray as $baseword => $wordInfo) {
		// A duplicate-key error will occur here if a word was NOT unset above. However as long as the words in the list are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
			'wid' => $wordInfo['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordInfo['metaphone']
		));
	}
}
1998
/**
 * Replaces the RELATIONS between words and the phash in index_rel:
 * existing relations of the phash are deleted, then one row per word is inserted.
 *
 * @param array $wordList Word list array
 * @param integer $phash phash value
 * @return void
 * @todo Define visibility
 */
public function submitWords($wordList, $phash) {
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_rel')) {
		return;
	}
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . (int) $phash);
	foreach ($wordList as $wordInfo) {
		// Frequency is the word's share of the total word count, mapped by freqMap()
		$frequency = $this->freqMap($wordInfo['count'] / $this->wordcount);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', array(
			'phash' => $phash,
			'wid' => $wordInfo['hash'],
			'count' => $wordInfo['count'],
			'first' => $wordInfo['first'],
			'freq' => $frequency,
			'flags' => $wordInfo['cmp'] & $this->flagBitMask
		));
	}
}
2023
/**
 * Maps a frequency with the factor $this->freqMax * 100 * $this->freqRange.
 *
 * Values below 1 are multiplied by the factor and capped at $this->freqRange;
 * values of 1 and above are divided by the factor (mapping "back").
 * The result is returned as computed, without casting to integer.
 *
 * @param double $freq Frequency
 * @return mixed Mapped frequency
 * @todo Define visibility
 */
public function freqMap($freq) {
	$scale = $this->freqMax * 100 * $this->freqRange;
	if ($freq < 1) {
		$mapped = $freq * $scale;
		if ($mapped > $this->freqRange) {
			return $this->freqRange;
		}
		return $mapped;
	}
	return $freq / $scale;
}
2042
2043 /********************************
2044 *
2045 * Hashing
2046 *
2047 *******************************/
/**
 * Calculates the search hashes for a TYPO3 page and stores them in
 * $this->hash['phash_grouping'] and $this->hash['phash'].
 *
 * @return void
 * @todo Define visibility
 */
public function setT3Hashes() {
	// Main array. The key order matters because the serialized array is hashed.
	$hashBase = array();
	$hashBase['id'] = intval($this->conf['id']);
	$hashBase['type'] = intval($this->conf['type']);
	$hashBase['sys_lang'] = intval($this->conf['sys_language_uid']);
	$hashBase['MP'] = strval($this->conf['MP']);
	$hashBase['cHash'] = $this->cHashParams;
	// Grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
	$this->hash['phash_grouping'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($hashBase));
	// Plain phash: gr_list is added as well (Subdivision where special page composition based on login is taken into account. It is expected that such pages are normally similar regardless of the login.)
	$hashBase['gr_list'] = strval($this->conf['gr_list']);
	$this->hash['phash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($hashBase));
}
2069
/**
 * Calculates the search hashes for an external file.
 * The grouping hash is built from the file name only, the phash from the file
 * name plus $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 * @todo Define visibility
 */
public function setExtHashes($file, $subinfo = array()) {
	$hashBase = array('file' => $file);
	// Grouping hash: file only
	$groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($hashBase));
	// phash: file and subinfo
	$hashBase['subinfo'] = $subinfo;
	$plainHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($hashBase));
	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $plainHash
	);
}
2091
2092 /*********************************
2093 *
2094 * Internal logging functions
2095 *
2096 *********************************/
/**
 * Forwards to the push() method of $GLOBALS['TT'], if that is an object.
 *
 * @param string $msg Title to set
 * @param string $key Key (?)
 * @return void
 * @todo Define visibility
 */
public function log_push($msg, $key) {
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker)) {
		return;
	}
	$timeTracker->push($msg, $key);
}
2110
/**
 * Forwards to the pull() method of $GLOBALS['TT'], if that is an object.
 *
 * @return void
 * @todo Define visibility
 */
public function log_pull() {
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker)) {
		return;
	}
	$timeTracker->pull();
}
2122
/**
 * Passes the message to setTSlogMessage() of $GLOBALS['TT'] (if that is an
 * object) and always appends it to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param integer $errorNum Error number
 * @return void
 * @todo Define visibility
 */
public function log_setTSlogMessage($msg, $errorNum = 0) {
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->setTSlogMessage($msg, $errorNum);
	}
	// The internal log is kept regardless of whether $GLOBALS['TT'] is available
	$this->internal_log[] = $msg;
}
2137
2138 /**************************
2139 *
2140 * tslib_fe hooks:
2141 *
2142 **************************/
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as a part of the fulltext index.
 * The comma-separated list is split and trimmed, re-joined with ", " and
 * wrapped in a leading and a trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://bugs.typo3.org/view.php?id=1436
 */
protected function addSpacesToKeywordList($keywordList) {
	$trimmedKeywords = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $keywordList);
	$spacedList = implode(', ', $trimmedKeywords);
	return ' ' . $spacedList . ' ';
}
2155
2156 }
2157
2158
2159 ?>