[TASK] Rename ExtensionManager class Part 2
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the TYPO3 project. The TYPO3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 * A copy is found in the textfile GPL.txt and important notices to the license
19 * from the author is found in LICENSE.txt distributed with these scripts.
20 *
21 *
22 * This script is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * This copyright notice MUST APPEAR in all copies of the script!
28 ***************************************************************/
29 /**
30 * This class is a search indexer for TYPO3
31 *
32 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
33 */
34 /**
35 * Indexing class for TYPO3 frontend
36 *
37 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
38 * @package TYPO3
39 * @subpackage tx_indexedsearch
40 */
41 class Indexer {
42
// Messages:
/**
 * Human-readable explanations for the result codes of the "does this page need (re-)indexing" check.
 * indexTypo3PageContent() looks the text up as $this->reasons[$check] when writing log messages.
 *
 * @var array
 * @todo Define visibility
 */
public $reasons = array(
-1 => 'mtime matched the document, so no changes detected and no content updated',
-2 => 'The minimum age was not exceeded',
1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * HTML code blocks to exclude from indexing: comma-separated list of tag names.
 * splitHTMLContent() removes every section embraced by one of these tags from the body.
 *
 * @var string
 * @todo Define visibility
 */
public $excludeSections = 'script,style';

/**
 * Supported extensions for external files: external parser objects.
 * Keys are file extension names, values are objects with certain methods
 * (filled by initializeExternalParsers()).
 *
 * @var array
 * @todo Define visibility
 */
public $external_parsers = array();

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination
 * to support search in access limited pages!)
 *
 * @var string
 * @todo Define visibility
 */
public $defaultGrList = '0,-1';

// Min/Max times:
/**
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var integer
 * @todo Define visibility
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit (in seconds) before a document can be indexed again.
 * This is regardless of mtime.
 *
 * @var integer
 * @todo Define visibility
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var integer
 * @todo Define visibility
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var boolean
 * @todo Define visibility
 */
public $forceIndexing = FALSE;

/**
 * Set when crawler is detected (internal)
 *
 * @var boolean
 * @todo Define visibility
 */
public $crawlerActive = FALSE;

// INTERNALS:
/**
 * Template for the array returned by splitHTMLContent(); every content part starts out empty.
 *
 * @var array
 * @todo Define visibility
 */
public $defaultContentArray = array(
'title' => '',
'description' => '',
'keywords' => '',
'body' => ''
);

/**
 * @var integer
 * @todo Define visibility
 */
public $wordcount = 0;

/**
 * @var integer
 * @todo Define visibility
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 * @todo Define visibility
 */
public $conf = array();

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 * @todo Define visibility
 */
public $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 * @todo Define visibility
 */
public $hash = array();

/**
 * Hash array for files
 *
 * @var array
 * @todo Define visibility
 */
public $file_phash_arr = array();

/**
 * Content of TYPO3 page (the array produced by splitHTMLContent())
 *
 * @var array
 * @todo Define visibility
 */
public $contentParts = array();

/**
 * Hash of the joined content parts, calculated in indexTypo3PageContent()
 *
 * @todo Define visibility
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 * @todo Define visibility
 */
public $internal_log = array();

/**
 * @var string
 * @todo Define visibility
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 * @todo Define visibility
 */
public $cHashParams = array();

/**
 * @var integer
 * @todo Define visibility
 */
public $freqRange = 32000;

/**
 * @var float
 * @todo Define visibility
 */
public $freqMax = 0.1;

/**
 * Whether metaphone search is enabled; set by init() from the extension configuration.
 *
 * @var boolean
 * @todo Define visibility
 */
public $enableMetaphoneSearch = FALSE;

/**
 * Set by init(): TRUE only when the index_words table is not in use and metaphone search is enabled.
 *
 * @var boolean
 * @todo Define visibility
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 * @todo Define visibility
 */
public $metaphoneContent = '';

// Objects:
/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 * @todo Define visibility
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 * @todo Define visibility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 * @todo Define visibility
 */
public $lexerObj;

/**
 * Set by init() from the "flagBitMask" extension setting, forced into the range 0-255.
 *
 * @todo Define visibility
 */
public $flagBitMask;
233
/**
 * Frontend hook: decides whether the page that the parent object (TSFE) has just rendered
 * is to be indexed and, if so, copies the relevant page data into $this->conf and runs the indexer.
 *
 * @param object $pObj Parent Object (frontend TSFE object), passed by reference
 * @return void
 * @todo Define visibility
 */
public function hook_indexContent(&$pObj) {
	// Indexer configuration from Extension Manager interface
	$extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// Crawler activation: the crawler must be loaded, a crawler session must be running
	// and re-indexing must have been requested as processing instruction.
	$crawlerRequestsReindexing = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($crawlerRequestsReindexing) {
		// Leave a simple message in the crawler log and raise both internal flags
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

	// Nothing to do (and nothing to log) unless indexing is enabled for this page
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page', '');
	if ($extensionSettings['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid, $pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Uids of the pages in the root line, keyed like the root line itself
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $level => $rootlinePage) {
			$rootlineUids[$level] = $rootlinePage['uid'];
		}
		// Internal configuration describing the page for which the indexing takes place
		$this->conf = array(
			// Page id
			'id' => $pObj->id,
			// Page type
			'type' => $pObj->type,
			// sys_language UID of the language of the indexing
			'sys_language_uid' => $pObj->sys_language_uid,
			// MP variable, if any (Mount Points)
			'MP' => $pObj->MP,
			// Group list
			'gr_list' => $pObj->gr_list,
			// cHash string for additional parameters
			'cHash' => $pObj->cHash,
			// Array of the additional parameters
			'cHash_array' => $pObj->cHash_array,
			// The creation date of the TYPO3 page
			'crdate' => $pObj->page['crdate'],
			// reg1 of the caching table. Not known what practical use this has.
			'page_cache_reg1' => $pObj->page_cache_reg1,
			// Root line uids
			'rootline_uids' => $rootlineUids,
			// Content string (HTML of TYPO3 page)
			'content' => $pObj->content,
			// Alternative title for indexing
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			// Character set of content (will be converted to utf-8 during indexing)
			'metaCharset' => $pObj->metaCharset,
			// Most recent modification time (seconds) of the content on the page.
			// Used to evaluate whether it should be re-indexed.
			'mtime' => $pObj->register['SYS_LASTCHANGED'],
			// Whether to index external documents like PDF, DOC etc. (if possible)
			'index_externals' => $pObj->config['config']['index_externals'],
			// Length of description text (max 250, default 200)
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			// Whether to index keywords and description meta tags; on unless configured otherwise
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,
			// Not used for frontend indexing, set to zero
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0
		);
		// Init and start indexing
		$this->init();
		$this->indexTypo3PageContent();
	}
	$this->log_pull();
}
324
325 /****************************
326 *
327 * Backend API
328 *
329 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Fills $this->conf with the page identification plus fixed defaults for the indexing behavior
 * and then calls init().
 *
 * @param integer $id The page uid, &id=
 * @param integer $type The page type, &type=
 * @param integer $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param boolean $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
 * @return void
 * @todo Define visibility
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = FALSE) {
	// cHash string for the additional parameters; stays empty unless explicitly requested
	if ($createCHash) {
		$cHash = \TYPO3\CMS\Core\Utility\GeneralUtility::generateCHash(\TYPO3\CMS\Core\Utility\GeneralUtility::implodeArrayForUrl('', $cHash_array));
	} else {
		$cHash = '';
	}
	// Internal configuration describing the page for which the indexing takes place
	$this->conf = array(
		// Page id (integer)
		'id' => $id,
		// Page type (integer)
		'type' => $type,
		// sys_language UID of the language of the indexing (integer)
		'sys_language_uid' => $sys_language_uid,
		// MP variable, if any (Mount Points) (string)
		'MP' => $MP,
		// Group list (hardcoded for now...)
		'gr_list' => '0,-1',
		// cHash string for additional parameters
		'cHash' => $cHash,
		// Array of the additional parameters
		'cHash_array' => $cHash_array,
		// Defaults; see backend_setFreeIndexUid() for overriding the free-index values
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',
		// Root line uids
		'rootline_uids' => $uidRL,
		// Whether to index external documents like PDF, DOC etc. (if possible)
		'index_externals' => 1,
		// Length of description text (max 250, default 200)
		'index_descrLgd' => 200,
		// Whether to index document keywords and description (if present)
		'index_metatags' => TRUE
	);
	// Init and start indexing
	$this->init();
}
378
/**
 * Stores the free-index uid and its set id in $this->conf.
 * Can be called right after backend_initIndexer()
 *
 * @param integer $freeIndexUid Free index UID
 * @param integer $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 * @return void
 * @todo Define visibility
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0) {
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId
	);
	foreach ($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
391
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * The values are escaped with htmlspecialchars(), wrapped into a minimal HTML document and
 * handed to indexTypo3PageContent() as if that document was a rendered TYPO3 page.
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param integer $mtime Last modification time, in seconds
 * @param integer $crdate The creation date of the content, in seconds
 * @param integer $recordUid The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 * @todo Define visibility
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0) {
	// Most recent modification time (seconds) of the content
	$this->conf['mtime'] = $mtime;
	// The creation date of the TYPO3 content
	$this->conf['crdate'] = $crdate;
	// UID of the record, if applicable
	$this->conf['recordUid'] = $recordUid;

	// Construct fake HTML for parsing
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);
	$fakeDocument = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedBody . '
</body>
</html>';
	// Content string (HTML of TYPO3 page)
	$this->conf['content'] = $fakeDocument;
	// Character set of content (will be converted to utf-8 during indexing)
	$this->conf['metaCharset'] = $charset;
	// No alternative title for indexing: the <title> of the fake document is used
	$this->conf['indexedDocTitle'] = '';

	// Index content as if it was a TYPO3 page
	$this->indexTypo3PageContent();
}
435
436 /********************************
437 *
438 * Initialization
439 *
440 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Prepares the cHash parameters, calculates the page hashes, reads the extension configuration
 * and instantiates external parsers (if enabled), lexer, metaphone hook object (if configured)
 * and charset converter.
 *
 * @return void
 * @todo Define visibility
 */
public function init() {
	// cHash parameters
	$this->cHashParams = $this->conf['cHash_array'];
	$hasCHashParams = is_array($this->cHashParams) && count($this->cHashParams);
	if ($hasCHashParams) {
		if ($this->conf['cHash']) {
			// Add this so that URL's come out right...
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}

	// Setting phash / phash_grouping which identifies the indexed page based on some of these variables
	$this->setT3Hashes();

	// Indexer configuration from Extension Manager interface (ages are configured in hours)
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$secondsPerHour = 3600;
	$this->tstamp_minAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['minAge'] * $secondsPerHour, 0);
	$this->tstamp_maxAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['maxAge'] * $secondsPerHour, 0);
	$this->maxExternalFiles = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
	$this->flagBitMask = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);

	// Workaround: If the extension configuration was not updated yet, the value is not existing.
	// In that case metaphone search counts as enabled.
	if (isset($this->indexerConfig['enableMetaphoneSearch'])) {
		$this->enableMetaphoneSearch = (bool) $this->indexerConfig['enableMetaphoneSearch'];
	} else {
		$this->enableMetaphoneSearch = TRUE;
	}
	// Metaphone info is only stored as words when the index_words table is not in use
	$wordTableIsUsed = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words');
	$this->storeMetaphoneInfoAsWords = !$wordTableIsUsed && $this->enableMetaphoneSearch;

	// Initialize external document parsers:
	// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

	// Initialize lexer (class that deconstructs the text into words):
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer']) {
		$lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'];
	}
	$this->lexerObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

	// Initialize metaphone hook:
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	// Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
	if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}

	// Init charset class
	$this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
}
489
/**
 * Instantiates the external parsers registered in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (file extension => object reference)
 * and keeps those whose initParser() call succeeds in $this->external_parsers.
 *
 * @return void
 * @access private
 * @see init()
 * @todo Define visibility
 */
public function initializeExternalParsers() {
	if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}
	foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] as $fileExtension => $parserReference) {
		// Register the parser first, then give it access to this indexer
		$this->external_parsers[$fileExtension] = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($parserReference);
		$this->external_parsers[$fileExtension]->pObj = $this;
		// Init parser and if it returns FALSE, unset its entry again
		$parserIsUsable = $this->external_parsers[$fileExtension]->initParser($fileExtension);
		if (!$parserIsUsable) {
			unset($this->external_parsers[$fileExtension]);
		}
	}
}
511
512 /********************************
513 *
514 * Indexing; TYPO3 pages (HTML content)
515 *
516 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Works on $this->conf and $this->hash: decides from modification time, group list and the
 * force flag whether anything has to be done, and then either indexes the content completely
 * or only refreshes timestamp, set id, group list and rootline of an already indexed page.
 *
 * @return void
 * @todo Define visibility
 */
public function indexTypo3PageContent() {
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	// Neither time stamps, nor a missing gr_list entry, nor the force flag ask for indexing
	if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		return;
	}

	// Setting message:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced', 1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', 1);
	}

	// Divide into title,keywords,description and body:
	$this->log_push('Split content', '');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

	// Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
	$this->content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

	// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
	// If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
	// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
	$checkCHash = $this->checkContentHash();

	// Same content is indexed already and the check result is not 1: only refresh the meta data
	if (is_array($checkCHash) && $check !== 1) {
		// Update the timestamp
		$this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
		$this->updateSetId($this->hash['phash']);
		// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->update_grlist($checkCHash['phash'], $this->hash['phash']);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

	// Full indexing; the time spent is measured from here
	$Pstart = \TYPO3\CMS\Core\Utility\GeneralUtility::milliseconds();
	$this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
	$this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
	$this->log_pull();

	// Splitting words
	$this->log_push('Extract words from content', '');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

	// Analyse the indexed words.
	$this->log_push('Analyse the extracted words', '');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

	// Submitting page (phash) record
	$this->log_push('Submitting page', '');
	$this->submitPage();
	$this->log_pull();

	// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words', '');
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $this->hash['phash']);
	}
	$this->log_pull();

	// Set parsetime
	$this->updateParsetime($this->hash['phash'], \TYPO3\CMS\Core\Utility\GeneralUtility::milliseconds() - $Pstart);

	// Checking external files if configured for.
	$this->log_push('Checking external files', '');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
593
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as head. If the content has no body tag at all,
 * the whole content is treated as head (so title and meta tags are still found) and the body is empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ie. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 * @todo Define visibility
 */
public function splitHTMLContent($content) {
	$contentArr = $this->defaultContentArray;

	// Divide head from body. stristr() returns FALSE when there is no <body> tag; the head length
	// is therefore calculated explicitly, because substr($content, 0, -0) would return an empty head.
	$bodyPart = stristr($content, '<body');
	if ($bodyPart === FALSE) {
		$bodyPart = '';
	}
	$contentArr['body'] = $bodyPart;
	$headPart = substr($content, 0, strlen($content) - strlen($bodyPart));

	// Get title; for titles of the form "prefix: title" only the part after the first colon is used
	$unusedRemainder = '';
	$unusedAttributes = '';
	$this->embracingTags($headPart, 'TITLE', $contentArr['title'], $unusedRemainder, $unusedAttributes);
	$titleParts = explode(':', $contentArr['title'], 2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

	// Get keywords and description metatags
	if ($this->conf['index_metatags']) {
		// Collect the attribute strings of all meta tags; each call cuts $headPart down to
		// what follows the meta tag that was just found.
		$metaTags = array();
		$unusedContent = '';
		$attributeString = '';
		while ($this->embracingTags($headPart, 'meta', $unusedContent, $headPart, $attributeString)) {
			$metaTags[] = $attributeString;
		}
		foreach ($metaTags as $metaTag) {
			$attributes = \TYPO3\CMS\Core\Utility\GeneralUtility::get_tag_attributes($metaTag);
			// Meta tags without "name" (e.g. http-equiv) or without "content" are legal
			$name = isset($attributes['name']) ? $attributes['name'] : '';
			$value = isset($attributes['content']) ? $attributes['content'] : '';
			if (stristr($name, 'keywords')) {
				$contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($value);
			}
			if (stristr($name, 'description')) {
				$contentArr['description'] .= ',' . $value;
			}
		}
	}

	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

	// Get rid of unwanted sections (ie. scripting and style stuff) in body
	foreach (explode(',', $this->excludeSections) as $tag) {
		// Every successful call removes one section (or cuts the body after an unclosed tag)
		while ($this->embracingTags($contentArr['body'], $tag, $unusedContent, $contentArr['body'], $unusedAttributes)) {

		}
	}

	// Remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));
	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

	return $contentArr;
}
646
/**
 * Extract the charset value from HTML meta tag.
 *
 * Only a <meta http-equiv="Content-Type" ...> tag with a quoted http-equiv value is recognized.
 * Charset names may consist of letters, digits and the characters "-", "_", "." and ":",
 * so that names like "Shift_JIS" are returned completely.
 *
 * @param string $content HTML content
 * @return string|NULL The charset value if found, otherwise NULL.
 * @todo Define visibility
 */
public function getHTMLcharset($content) {
	$contentTypeTag = array();
	if (!preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $contentTypeTag)) {
		return NULL;
	}
	$charsetMatch = array();
	if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]_.:-]+)/i', $contentTypeTag[0], $charsetMatch)) {
		return $charsetMatch[1];
	}
	return NULL;
}
661
/**
 * Converts a HTML document to utf-8
 *
 * The charset is taken from $charset or, if that is empty, from the document's meta tag.
 * HTML entities are converted as well, using the charset converter in $this->csObj.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 * @todo Define visibility
 */
public function convertHTMLToUtf8($content, $charset = '') {
	// Find charset: an explicitly passed one wins over the one declared in the document
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$normalizedCharset = $this->csObj->parse_charset($charset);

	// Convert charset, unless it is unknown or the content is utf-8 already
	$needsConversion = $normalizedCharset && $normalizedCharset !== 'utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $normalizedCharset);
	}

	// Convert entities, assuming document is now UTF-8
	return $this->csObj->entities_to_utf8($content, TRUE);
}
682
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The search is case-insensitive. If the start tag is found but no matching end tag follows,
 * $tagContent is empty and $stringAfter holds only what follows the start tag. None of the
 * reference parameters is touched when FALSE is returned.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
 * @todo Define visibility
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
	$startTag = '<' . $tagName;
	$endTag = '</' . $tagName . '>';

	// stristr used because we want a case-insensitive search for the tag.
	$fromStartTag = stristr($string, $startTag);
	if ($fromStartTag === FALSE) {
		return FALSE;
	}

	// Split "attributes>rest". A start tag that is never closed with ">" yields a single part;
	// the rest is an empty string then (instead of an undefined offset / NULL).
	$tagParts = explode('>', substr($fromStartTag, strlen($startTag)), 2);
	$paramList = $tagParts[0];
	$afterStartTag = isset($tagParts[1]) ? $tagParts[1] : '';

	$fromEndTag = stristr($afterStartTag, $endTag);
	if ($fromEndTag) {
		// The start tag begins where the remainder found by stristr() begins
		$stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
		$tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
		$stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
	} else {
		$tagContent = '';
		$stringAfter = $afterStartTag;
	}
	return TRUE;
}
717
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin" marker is kept,
 * content following an "end" marker is dropped. Content in front of the very first marker is kept
 * only if that marker is an "end" marker.
 *
 * @param string $body HTML Content, passed by reference
 * @return boolean Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 * @todo Define visibility
 */
public function typoSearchTags(&$body) {
	$expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
	// preg_split() returns FALSE on failure; a single piece means that there is no marker
	if (!is_array($expBody) || count($expBody) < 2) {
		return FALSE;
	}

	$body = '';
	// Piece without begin/end marker that is waiting for a following "end" marker
	$prev = '';
	foreach ($expBody as $val) {
		$part = explode('-->', $val, 2);
		$marker = trim($part[0]);
		if ($marker === 'begin') {
			// A marker comment that is never closed with "-->" has no content part
			$body .= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker === 'end') {
			$body .= $prev;
		} else {
			$prev = $val;
		}
	}
	return TRUE;
}
745
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme (and without local path) are handed to indexExternalUrl() if the extension
 * setting "indexExternalURLs" is set. Links without a query string that resolve to an existing
 * local file are either added to the crawler queue (if the crawler is loaded and the setting
 * "useCrawlerForExternalFiles" is set) or indexed directly with indexRegularDocument().
 *
 * @param string $content HTML content
 * @return void
 * @todo Define visibility
 */
public function extractLinks($content) {
	// Get links:
	$list = $this->extractHyperLinks($content);

	// Crawler object; stays NULL when files are to be indexed directly
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_lib');
	}

	// Traverse links:
	foreach ($list as $linkInfo) {
		// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
		$hasLocalPath = !empty($linkInfo['localPath']);
		// Decode entities:
		$linkSource = \TYPO3\CMS\Core\Utility\GeneralUtility::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

		// Parse URL (parse_url() returns FALSE for seriously malformed URLs, hence the empty() checks):
		$qParts = parse_url($linkSource);
		// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
			$getP = array();
			parse_str($qParts['query'], $getP);
			// The query may only contain e.g. "nojumpurl=" or an array value; there is no link target then
			$linkSource = isset($getP['jumpurl']) && is_string($getP['jumpurl']) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);
		}

		if (!$hasLocalPath && !empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}
		// Local links with a query string do not point to plain files
		if (!empty($qParts['query'])) {
			continue;
		}

		$linkSource = urldecode($linkSource);
		if (\TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
		}
		// Errors of is_file() are suppressed on purpose, the path comes from page content
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		if (is_object($crawler)) {
			// Hand the file over to the crawler; "alturl" is only set for files sent by a download script
			$params = array(
				'document' => $linkSource
			);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			// The page content is not needed for indexing the file
			unset($params['conf']['content']);
			// NOTE(review): callback reference kept exactly as found - confirm that it still resolves
			$crawler->addQueueEntry_callBack(0, $params, 'EXT:indexed_search/class.crawler.php:&TYPO3\\CMS\\IndexedSearch\\Controller\\SearchFormController_files', $this->conf['id']);
			$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
		} elseif ($hasLocalPath) {
			// Index local file under the URL of the download script
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			// Index local file:
			$this->indexRegularDocument($linkSource);
		}
	}
}
823
/**
 * Collects the <a> tags of an HTML string that carry a usable href.
 *
 * Links whose href is empty or starts with "#" are left out.
 *
 * @param string $html HTML content to scan
 * @return array List of hyperlinks, each with the keys "tag", "href" and "localPath" (empty if not local)
 * @see extractLinks()
 * @todo Define visibility
 */
public function extractHyperLinks($html) {
	$parser = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('t3lib_parseHtml');
	$links = array();
	foreach ($parser->splitTags('a', $html) as $position => $fragment) {
		// Only the odd-numbered parts of the split result are inspected as tags
		if ($position % 2 === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($fragment, TRUE);
		$tagName = $parser->getFirstTagName($fragment);
		if (strtolower($tagName) != 'a') {
			continue;
		}
		$target = $attributes[0]['href'];
		if (!$target || $target[0] == '#') {
			continue;
		}
		$links[] = array(
			'tag' => $fragment,
			'href' => $target,
			'localPath' => $this->createLocalPath($target)
		);
	}
	return $links;
}
853
/**
 * Finds the href of the first <base> tag that has a non-empty href.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html) {
	$baseHref = '';
	$parser = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('t3lib_parseHtml');
	foreach ($parser->splitTags('base', $html) as $position => $fragment) {
		// Only the odd-numbered parts of the split result are inspected as tags
		if ($position % 2 === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($fragment, TRUE);
		$tagName = $parser->getFirstTagName($fragment);
		if (strtolower($tagName) != 'base') {
			continue;
		}
		$baseHref = $attributes[0]['href'];
		if ($baseHref) {
			// First usable href wins
			break;
		}
	}
	return $baseHref;
}
878
879 /******************************************
880 *
881 * Indexing; external URL
882 *
883 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The URL is only fetched and indexed when its headers announce a "text/html"
 * content type. The content is written to a temporary file, indexed with
 * indexRegularDocument() (forced) and the temporary file is removed afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 * @todo Define visibility
 */
public function indexExternalUrl($externalUrl) {
	// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	// HTTP header names are case-insensitive, so "content-type" must match as well.
	// If the header occurs more than once the last occurrence is used.
	$contentType = '';
	if (is_array($urlHeaders)) {
		foreach ($urlHeaders as $headerName => $headerValue) {
			if (strtolower(trim((string) $headerName)) === 'content-type') {
				$contentType = (string) $headerValue;
			}
		}
	}
	if (!stristr($contentType, 'text/html')) {
		return;
	}
	$content = ($this->indexExternalUrl_content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($externalUrl));
	if (!strlen($content)) {
		return;
	}
	// Create temporary file:
	$tmpFile = \TYPO3\CMS\Core\Utility\GeneralUtility::tempnam('EXTERNAL_URL');
	if (!$tmpFile) {
		return;
	}
	\TYPO3\CMS\Core\Utility\GeneralUtility::writeFile($tmpFile, $content);
	try {
		// Index that file:
		// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
	} catch (\Exception $exception) {
		// Do not leave the temporary file behind when indexing fails
		unlink($tmpFile);
		throw $exception;
	}
	unlink($tmpFile);
}
914
/**
 * Getting HTTP request headers of URL
 *
 * Every non-empty line of the response is split at the first colon; the part
 * before it becomes the array key, the remainder (untrimmed) the value. Lines
 * without a colon are stored with a NULL value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 * @todo Define visibility
 */
public function getUrlHeaders($url) {
	// Try to get the headers only
	$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($url, 2);
	if (!strlen($content)) {
		// Explicit FALSE as documented (the function used to fall through without a return value)
		return FALSE;
	}
	// Compile headers:
	$retVal = array();
	foreach (\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(LF, $content, 1) as $line) {
		if (!strlen(trim($line))) {
			break;
		}
		// Pad to two elements so lines without ":" do not raise an undefined offset notice
		$headerParts = array_pad(explode(':', $line, 2), 2, NULL);
		$retVal[$headerParts[0]] = $headerParts[1];
	}
	return $retVal;
}
940
/**
 * Tries to map a link target to a local file.
 *
 * The resolver methods are tried in a fixed order; the first non-empty result wins.
 *
 * @param string $sourcePath Link target as found in the href attribute
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
	static $resolvers = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);
	$resolved = '';
	foreach ($resolvers as $resolver) {
		$resolved = $this->$resolver($sourcePath);
		if ($resolved != '') {
			return $resolved;
		}
	}
	return $resolved;
}
964
/**
 * Looks the source path up in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 * The lookup key is the short MD5 of the source path. This is useful for
 * various download extensions that hide the actual file name but still want
 * the file to be indexed.
 *
 * @param string $sourcePath
 * @return string Registered file path if it exists as a file, otherwise an empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}
	$lookupKey = \TYPO3\CMS\Core\Utility\GeneralUtility::shortMD5($sourcePath);
	// Note: self::isAllowedLocalFile is deliberately not used here because this
	// method is allowed to index files outside of the web site (for example,
	// protected downloads)
	if (isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey])) {
		return $registeredFiles[$lookupKey];
	}
	return '';
}
987
/**
 * Maps a URL that starts with the current TYPO3_SITE_URL to a file below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), otherwise an empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	$siteUrl = \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);
	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}
	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1007
/**
 * Maps a URL that starts with the configured absRefPrefix to a file below
 * PATH_site. This requires TSFE. If TSFE is missing, this function does nothing.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), otherwise an empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	if (!($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController)) {
		return '';
	}
	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);
	if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}
	$candidate = PATH_site . substr($sourcePath, $prefixLength);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1030
/**
 * Maps a URL without schema that starts with "/" to a file below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), otherwise an empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	if ($sourcePath[0] != '/') {
		return '';
	}
	// Drop the leading slash before prepending the site path
	$candidate = PATH_site . substr($sourcePath, 1);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1049
/**
 * Maps a relative URL to a file below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local path if it passes isAllowedLocalFile(), otherwise an empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}
	$candidate = PATH_site . $sourcePath;
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1066
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not start with "/".
 *
 * URLs that parse_url() cannot parse are reported as relative (unchanged
 * behaviour); callers still verify that the resulting path is an existing file.
 *
 * @param string $url
 * @return boolean
 */
static protected function isRelativeURL($url) {
	$urlParts = @parse_url($url);
	if (!is_array($urlParts)) {
		// parse_url() returns FALSE for seriously malformed URLs
		$urlParts = array();
	}
	// parse_url() omits components that are not present, so guard every access
	$scheme = isset($urlParts['scheme']) ? (string) $urlParts['scheme'] : '';
	$path = isset($urlParts['path']) ? (string) $urlParts['path'] : '';
	return $scheme === '' && substr($path, 0, 1) !== '/';
}
1077
/**
 * Checks if the path (after resolving back paths) starts with PATH_site and
 * points to an existing file.
 *
 * @param string $filePath
 * @return boolean
 */
static protected function isAllowedLocalFile($filePath) {
	$resolvedPath = \TYPO3\CMS\Core\Utility\GeneralUtility::resolveBackPath($filePath);
	$exists = is_file($resolvedPath);
	$leadingPart = substr($resolvedPath, 0, strlen(PATH_site));
	return $leadingPart == PATH_site && $exists;
}
1090
1091 /******************************************
1092 *
1093 * Indexing; external files (PDF, DOC, etc)
1094 *
1095 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * For every section reported by fileContentParts() the document is (re-)indexed
 * when checkMtimeTstamp() demands it or $force is set; a section record is
 * submitted for every section, also when the content itself is not indexed.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it passes GeneralUtility::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param boolean $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 * @todo Define visibility
 */
public function indexRegularDocument($file, $force = FALSE, $contentTmpFile = '', $altExtension = '') {
	// Init. pathinfo() with PATHINFO_EXTENSION yields '' for files without extension instead of a missing key.
	$ext = $altExtension ? $altExtension : strtolower(pathinfo($file, PATHINFO_EXTENSION));
	// Create abs-path:
	if (!$contentTmpFile) {
		if (!\TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath($file)) {
			// Relative, prepend PATH_site:
			$absFile = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName(PATH_site . $file);
		} else {
			// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = \TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
		return;
	}
	// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext, $absFile);
	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
		$Pstart = \TYPO3\CMS\Core\Utility\GeneralUtility::milliseconds();
		// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$subinfo = array('key' => $cPKey);
		$phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag', 1);
			}
			// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
				// Divide into title,keywords,description and body:
				$this->log_push('Split content', '');
				$contentParts = $this->readFileContent($ext, $absFile, $cPKey);
				$this->log_pull();
				if (is_array($contentParts)) {
					// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
					// Separator first: the reversed (array, separator) argument order is not accepted by PHP 8.
					$content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(implode('', $contentParts));
					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
						// Increment counter:
						$this->externalFileCounter++;
						// Splitting words
						$this->log_push('Extract words from content', '');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();
						// Analyse the indexed words.
						$this->log_push('Analyse the extracted words', '');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();
						// Submitting page (phash) record
						$this->log_push('Submitting page', '');
						$size = filesize($absFile);
						// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$ctime = filemtime($absFile);
						$this->submitFilePage($phash_arr, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts);
						$this->log_pull();
						// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words', '');
						if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
							$this->checkWordList($indexArr);
							$this->submitWords($indexArr, $phash_arr['phash']);
						}
						$this->log_pull();
						// Set parsetime
						$this->updateParsetime($phash_arr['phash'], \TYPO3\CMS\Core\Utility\GeneralUtility::milliseconds() - $Pstart);
					} else {
						// Update the timestamp
						$this->updateTstamp($phash_arr['phash'], $mtime);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
		}
		// Checking and setting sections:
		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
1204
/**
 * Delegates reading of an external file to the parser registered for its extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array Standard content array (title, description, keywords, body keys); NULL if no parser object is registered for the extension
 * @todo Define visibility
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
	// Consult relevant external document parser:
	$parser = $this->external_parsers[$fileExtension];
	if (!is_object($parser)) {
		return NULL;
	}
	return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1223
/**
 * Asks the parser registered for the extension into which sections the document is divided.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; array(0) if no parser object is registered
 * @todo Define visibility
 */
public function fileContentParts($ext, $absFile) {
	// Consult relevant external document parser:
	$parser = $this->external_parsers[$ext];
	if (is_object($parser)) {
		return $parser->fileContentParts($ext, $absFile);
	}
	return array(0);
}
1240
/**
 * Wraps non-HTML content (from external files for instance) into a content array.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Copy of $this->defaultContentArray with the key "body" set to $content
 * @see splitHTMLContent()
 * @todo Define visibility
 */
public function splitRegularContent($content) {
	// Start from the default content array and only fill in the body
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;
	return $parts;
}
1254
1255 /**********************************
1256 *
1257 * Analysing content, Extracting words
1258 *
1259 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * charset already is "utf-8") and decodes numeric / HTML entities in it.
 *
 * @param array $contentArr Standard content array, modified in place
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 * @todo Define visibility
 */
public function charsetEntity2utf8(&$contentArr, $charset) {
	$needsCharsetConversion = $charset !== 'utf-8';
	foreach (array_keys($contentArr) as $field) {
		if (!strlen($contentArr[$field])) {
			continue;
		}
		// Convert charset if necessary
		if ($needsCharsetConversion) {
			$contentArr[$field] = $this->csObj->utf8_encode($contentArr[$field], $charset);
		}
		// decode all numeric / html-entities in the string to real characters:
		$contentArr[$field] = $this->csObj->entities_to_utf8($contentArr[$field], TRUE);
	}
}
1280
/**
 * Splits every part of the content array into words using the lexer.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words (unique for title, keywords and description)
 * @todo Define visibility
 */
public function processWordsInArrays($contentArr) {
	// split all parts to words
	foreach (array_keys($contentArr) as $field) {
		$contentArr[$field] = $this->lexerObj->split2Words($contentArr[$field]);
	}
	// For title, keywords, and description we don't want duplicates:
	foreach (array('title', 'keywords', 'description') as $headerField) {
		$contentArr[$headerField] = array_unique($contentArr[$headerField]);
	}
	return $contentArr;
}
1300
/**
 * Extracts the sample description text from the content array.
 *
 * Spaces, tabs and line breaks of the body are replaced by single spaces and the
 * result is truncated to the length configured in "index_descrLgd".
 *
 * @param array $contentArr Content array
 * @return string Description string; empty if the resulting maximum length is zero
 * @todo Define visibility
 */
public function bodyDescription($contentArr) {
	// Always return a string; the variable used to be undefined when $maxL was zero
	$bodyDescription = '';
	// Setting description
	$maxL = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
	if ($maxL) {
		$bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);
		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}
	return $bodyDescription;
}
1318
/**
 * Analyzes content to use for indexing.
 *
 * Title, keywords and description are registered with the priority bits 7, 6
 * and 5; the body words are added afterwards.
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array keyed by word, filled by analyzeHeaderinfo() and analyzeBody()
 * @todo Define visibility
 */
public function indexAnalyze($content) {
	$indexArr = array();
	$this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
	$this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
	$this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
	$this->analyzeBody($indexArr, $content);
	return $indexArr;
}
1335
/**
 * Calculates relevant information for headercontent
 *
 * For each word the hash and metaphone value are stored on first occurrence,
 * the priority bit for this header type is OR-ed into "cmp" and "count" is
 * increased.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param integer $offset Bit-wise priority to type
 * @return void
 * @todo Define visibility
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
	if (!isset($content[$key]) || !is_array($content[$key])) {
		// Nothing to analyze for this header type
		return;
	}
	// Priority used for flagBitMask feature (see extension configuration)
	$priorityBit = pow(2, $offset);
	foreach ($content[$key] as $val) {
		// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);
		if (!isset($retArr[$val])) {
			// Word ID (wid)
			$retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
			// Metaphone value is also 60 only chars long
			$metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
			$retArr[$val]['metaphone'] = $metaphone;
		}
		// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}
		// "cmp" and "count" do not exist yet for a word seen for the first time
		$currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$retArr[$val]['cmp'] = $currentFlags | $priorityBit;
		// Increase number of occurences
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1368
/**
 * Calculates relevant information for bodycontent
 *
 * For each word the position of the first occurrence, the hash and the
 * metaphone value are stored when the word is not in the index array yet;
 * "count" is increased for every occurrence.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 * @todo Define visibility
 */
public function analyzeBody(&$retArr, $content) {
	if (!isset($content['body']) || !is_array($content['body'])) {
		// Nothing to analyze
		return;
	}
	foreach ($content['body'] as $key => $val) {
		// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);
		if (!isset($retArr[$val])) {
			// First occurence (used for ranking results)
			$retArr[$val]['first'] = $key;
			// Word ID (wid)
			$retArr[$val]['hash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($val);
			// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
			$retArr[$val]['metaphone'] = $metaphone;
		}
		// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}
		// Increase number of occurences ("count" does not exist yet for a new word)
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1399
/**
 * Creating metaphone based hash from input word
 *
 * Uses $this->metaphoneObj when it is an object, otherwise PHP's native metaphone().
 *
 * @param string $word Word to convert
 * @param boolean $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 * @todo Define visibility
 */
public function metaphone($word, $returnRawMetaphoneValue = FALSE) {
	// Native PHP function is the fallback for the advanced metaphone object
	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);
	if ($returnRawMetaphoneValue) {
		return $rawValue;
	}
	if (!strlen($rawValue)) {
		return 0;
	}
	// Create hash and return integer
	return \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($rawValue);
}
1425
1426 /********************************
1427 *
1428 * SQL; TYPO3 Pages
1429 *
1430 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Old records for the current phash are removed first; then the index_phash,
 * index_section, index_grlist, index_fulltext and (in debug mode) index_debug
 * records are written, each only if the table is in use.
 *
 * @return void
 * @todo Define visibility
 */
public function submitPage() {
	// Remove any current data for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);
	// setting new phash_row
	$phashRow = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		// item_type 0 = TYPO3 page
		'item_type' => 0,
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		// Creation date of page
		'item_crdate' => $this->conf['crdate'],
		// Sys language uid of the page. Should reflect which language it DOES actually display!
		'sys_language_uid' => $this->conf['sys_language_uid'],
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId'])
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
	}
	// PROCESSING index_section
	$this->submit_section($this->hash['phash'], $this->hash['phash']);
	// PROCESSING index_grlist
	$this->submit_grlist($this->hash['phash'], $this->hash['phash']);
	// PROCESSING index_fulltext
	$fullText = implode(' ', $this->contentParts);
	if ($this->indexerConfig['fullTextDataLength'] > 0) {
		// Limit the stored fulltext to the configured length
		$fullText = substr($fullText, 0, $this->indexerConfig['fullTextDataLength']);
	}
	$fulltextRow = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => $fullText,
		'metaphonedata' => $this->metaphoneContent
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
	}
	// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	// Page content and body are shortened to 1000 characters for the debug record
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
		'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString
	);
	$debugRow = array(
		'phash' => $this->hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1505
/**
 * Stores gr_list in the database (table index_grlist, if in use).
 *
 * @param integer $hash Search result record phash
 * @param integer $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 * @todo Define visibility
 */
public function submit_grlist($hash, $phash_x) {
	// Setting the gr_list record
	$groupList = $this->conf['gr_list'];
	$row = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($groupList),
		'gr_list' => $groupList
	);
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
		return;
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $row);
}
1527
/**
 * Stores section (table index_section, if in use).
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param integer $hash phash of TYPO3 parent search result record
 * @param integer $hash_t3 phash of the file indexation search record
 * @return void
 * @todo Define visibility
 */
public function submit_section($hash, $hash_t3) {
	$row = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);
	// Let getRootLineFields() work on the record before it is stored
	$this->getRootLineFields($row);
	if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		return;
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $row);
}
1548
/**
 * Removes records for the indexed page, $phash
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedPages($phash) {
	$phashValue = intval($phash);
	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $table) {
		if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($table)) {
			continue;
		}
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . $phashValue);
	}
	// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $phashValue);
	}
}
1569
1570 /********************************
1571 *
1572 * SQL; External media
1573 *
1574 *******************************/
/**
 * Updates db with information about the file
 *
 * Old records for the file's phash are removed first; then the index_phash,
 * index_fulltext and (in debug mode) index_debug records are written, each
 * only if the table is in use.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param integer $mtime Modification time of file.
 * @param integer $ctime Creation time of file.
 * @param integer $size Size of file in bytes
 * @param integer $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 * @todo Define visibility
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {
	// Find item Type: the parser's mapping wins, the extension itself is the fallback
	$itemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	if (!$itemType) {
		$itemType = $ext;
	}
	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);
	// Split filename: a file name with a scheme is flagged as external URL
	$fileParts = parse_url($file);
	$isExternalUrl = $fileParts['scheme'] ? 1 : 0;
	// Setting new
	$phashRow = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $itemType,
		// The file's base name is used when the content has no title
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $isExternalUrl,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId'])
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
	}
	// PROCESSING index_fulltext
	$fullText = implode(' ', $contentParts);
	if ($this->indexerConfig['fullTextDataLength'] > 0) {
		// Limit the stored fulltext to the configured length
		$fullText = substr($fullText, 0, $this->indexerConfig['fullTextDataLength']);
	}
	$fulltextRow = array(
		'phash' => $hash['phash'],
		'fulltextdata' => $fullText,
		'metaphonedata' => $this->metaphoneContent
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
	}
	// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	// The body is shortened to 1000 characters for the debug record
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString
	);
	$debugRow = array(
		'phash' => $hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1650
/**
 * Makes sure a gr_list record exists for a file.
 *
 * Nothing is written if index_grlist already holds a row for the phash whose
 * hash_gr_list matches either the default group list or the current group list.
 *
 * @param integer $hash phash value of the file
 * @return void
 * @todo Define visibility
 */
public function submitFile_grlist($hash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $defaultListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList);
    $currentListHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $where = 'phash=' . intval($hash) . ' AND (hash_gr_list=' . $defaultListHash . ' OR hash_gr_list=' . $currentListHash . ')';
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    // No matching record yet: store one, using the file's phash for both arguments.
    if ($existing == 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1667
/**
 * Makes sure a section record exists for a file on the current page.
 *
 * A record is submitted only if index_section has no row for the given phash
 * combined with the page id from the configuration.
 *
 * @param integer $hash phash value of the file
 * @return void
 * @todo Define visibility
 */
public function submitFile_section($hash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $where = 'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id']);
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
    if ($existing == 0) {
        // The second argument is the phash of the page currently being indexed.
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1684
/**
 * Deletes the rows stored for an indexed file, identified by $phash.
 *
 * Affects index_phash, index_grlist, index_fulltext and index_debug; tables
 * reported as unused are left alone.
 *
 * @param integer $phash phash value to flush
 * @return void
 * @todo Define visibility
 */
public function removeOldIndexedFiles($phash) {
    $where = 'phash=' . intval($phash);
    $tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($tables as $tableName) {
        if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
    }
}
1701
1702 /********************************
1703 *
1704 * SQL Helper functions
1705 *
1706 *******************************/
/**
 * Decides, from the mtime / tstamp stored for $phash, whether the page or file must be (re)indexed.
 *
 * Result codes (see $this->reasons); negative means "do not index", positive means "index":
 *  -2  the minimum age has not been exceeded yet
 *  -1  mtime matches the indexed record, nothing changed
 *   1  the maximum age was exceeded
 *   2  minimum age passed and mtime differs from the indexed record
 *   3  minimum age passed but no mtime was given
 *   4  no indexed record exists (or index_phash is not in use)
 *
 * Side effect for result -1: when no maximum age is configured, the tstamp of the
 * indexed record is refreshed; in both cases a log message is written.
 *
 * @param integer $mtime mtime value to test against the limits and the indexed record
 * @param integer $phash phash used to look up the already indexed record
 * @return integer One of the result codes listed above
 * @todo Define visibility
 */
public function checkMtimeTstamp($mtime, $phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }
    $indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . intval($phash));
    if (!$indexedRow) {
        // Never indexed: there is no row in index_phash for this phash.
        return 4;
    }
    $now = $GLOBALS['EXEC_TIME'];
    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $now) {
        // The configured max-age was exceeded, so the document is indexed again.
        return 1;
    }
    if ($this->tstamp_minAge && $indexedRow['tstamp'] + $this->tstamp_minAge >= $now) {
        // A minimum age is configured and has not been exceeded yet.
        return -2;
    }
    if (!$mtime) {
        // Minimum age passed (or not set) but there is no mtime to compare: index.
        return 3;
    }
    if ($indexedRow['item_mtime'] != $mtime) {
        // mtime differs from the one stored in index_phash: re-index.
        return 2;
    }
    // mtime matched the indexed record, so no content update is needed.
    if ($this->tstamp_maxAge) {
        $secondsLeft = $indexedRow['tstamp'] + $this->tstamp_maxAge - $now;
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . $secondsLeft . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1763
/**
 * Looks for an index_phash row with the current phash_grouping and content hash.
 *
 * With this lookup a page is only indexed when its content differs from an
 * already indexed page of the same "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no matching row, or index_phash not in use), otherwise the matching row (an array holding "phash") to which the grlist record should be related
 * @todo Define visibility
 */
public function checkContentHash() {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return TRUE;
    }
    $where = 'phash_grouping=' . intval($this->hash['phash_grouping']) . ' AND contentHash=' . intval($this->content_md5h);
    $match = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
    return $match ? $match : TRUE;
}
1781
/**
 * Tells whether an external document still has to be indexed, judged by its content hash.
 *
 * @param integer $hashGr phash_grouping value to check
 * @param integer $content_md5h Content hash to check
 * @return boolean TRUE if no index_phash row matches both values (or index_phash is not in use), i.e. the document needs to be indexed
 * @todo Define visibility
 */
public function checkExternalDocContentHash($hashGr, $content_md5h) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return TRUE;
    }
    $where = 'phash_grouping=' . intval($hashGr) . ' AND contentHash=' . intval($content_md5h);
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
    return $matches == 0;
}
1799
/**
 * Checks whether index_grlist holds a record whose phash_x equals the given value
 * (the "real" phash of the current content, not the linked-to phash of the common search result page).
 *
 * @param integer $phash_x phash integer to test
 * @return boolean TRUE if at least one record exists; FALSE otherwise or when index_grlist is not in use
 * @todo Define visibility
 */
public function is_grlist_set($phash_x) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return FALSE;
    }
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . intval($phash_x));
    return $matches > 0;
}
1815
/**
 * Writes a grlist entry for the phash and the current gr_list unless one exists already.
 *
 * @param integer $phash phash of the search result that should be found
 * @param integer $phash_x The real phash of the current content. It differs from $phash when a page with user login turns out to have exactly the same content as an already indexed version of the page
 * @return void
 * @see submit_grlist()
 * @todo Define visibility
 */
public function update_grlist($phash, $phash_x) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $listHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $where = 'phash=' . intval($phash) . ' AND hash_gr_list=' . $listHash;
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing == 0) {
        $this->submit_grlist($phash, $phash_x);
        $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
    }
}
1834
/**
 * Sets the tstamp of an index_phash row to the current execution time.
 *
 * @param integer $phash phash value
 * @param integer $mtime If non-zero, item_mtime is updated to this value as well
 * @return void
 * @todo Define visibility
 */
public function updateTstamp($phash, $mtime = 0) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = array('tstamp' => $GLOBALS['EXEC_TIME']);
    if ($mtime) {
        $changes['item_mtime'] = intval($mtime);
    }
    $where = 'phash=' . intval($phash);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
1854
/**
 * Writes the configured freeIndexSetId into the index_phash row of $phash.
 *
 * @param integer $phash phash value
 * @return void
 * @todo Define visibility
 */
public function updateSetId($phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $where = 'phash=' . intval($phash);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array(
        'freeIndexSetId' => intval($this->conf['freeIndexSetId'])
    ));
}
1870
/**
 * Stores the parse time in the index_phash row of $phash.
 *
 * @param integer $phash phash value
 * @param integer $parsetime Parsetime value to set
 * @return void
 * @todo Define visibility
 */
public function updateParsetime($phash, $parsetime) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $where = 'phash=' . intval($phash);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array(
        'parsetime' => intval($parsetime)
    ));
}
1887
/**
 * Rewrites the rootline fields of all index_section rows belonging to the current page id.
 *
 * @return void
 * @todo Define visibility
 */
public function updateRootline() {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // getRootLineFields() fills the array by reference.
    $rootlineFields = array();
    $this->getRootLineFields($rootlineFields);
    $where = 'page_id=' . intval($this->conf['id']);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootlineFields);
}
1901
/**
 * Adds the rootline field values to a field array.
 *
 * rl0, rl1 and rl2 are always set from rootline levels 0, 1 and 2. Further
 * "field name => rootline level" pairs are taken from
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * when that setting is an array.
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 * @todo Define visibility
 */
public function getRootLineFields(array &$fieldArray) {
    $rootlineUids = $this->conf['rootline_uids'];
    // Standard fields: the list index doubles as the rootline level.
    foreach (array('rl0', 'rl1', 'rl2') as $level => $standardField) {
        $fieldArray[$standardField] = intval($rootlineUids[$level]);
    }
    $additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
    if (!is_array($additionalFields)) {
        return;
    }
    foreach ($additionalFields as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = intval($rootlineUids[$rootLineLevel]);
    }
}
1920
/**
 * Removes indexed pages that share the current phash_grouping and content hash but
 * whose grlist record is for a group list other than the default one.
 * NOT USED anywhere inside this class!
 *
 * @return void
 * @todo Define visibility
 */
public function removeLoginpagesWithContentHash() {
    $phashUsed = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash');
    if (!$phashUsed || !\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $where = '
        A.phash=B.phash
        AND A.phash_grouping=' . intval($this->hash['phash_grouping']) . '
        AND B.hash_gr_list<>' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList) . '
        AND A.contentHash=' . intval($this->content_md5h);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', $where);
    // Only iterate when the query produced a result.
    if ($res) {
        while (($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) !== FALSE) {
            $this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
            $this->removeOldIndexedPages($row['phash']);
        }
    }
    $GLOBALS['TYPO3_DB']->sql_free_result($res);
}
1942
/**
 * Loads class.tx_crawler_lib.php from the "crawler" extension directory.
 *
 * @return void
 * @todo Define visibility
 */
public function includeCrawlerClass() {
    $crawlerPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler');
    \TYPO3\CMS\Core\Utility\GeneralUtility::requireOnce($crawlerPath . 'class.tx_crawler_lib.php');
}
1952
1953 /********************************
1954 *
1955 * SQL; Submitting words
1956 *
1957 *******************************/
/**
 * Inserts the words of the list that are not yet present in index_words.
 *
 * The number of stored rows matching the word hashes is compared with the size of
 * the list; only when they differ are the stored base words fetched, dropped from
 * the list, and the remaining words inserted.
 *
 * @param array $wordListArray Word list array keyed by word (each entry holds "hash", "metaphone" and further information)
 * @return void
 * @todo Define visibility
 */
public function checkWordList($wordListArray) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words') || !count($wordListArray)) {
        return;
    }
    $wordIds = array();
    foreach ($wordListArray as $wordInfo) {
        $wordIds[] = intval($wordInfo['hash']);
    }
    $where = 'wid IN (' . implode(',', $wordIds) . ')';
    $storedCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
    if ($storedCount == count($wordListArray)) {
        // Every word is stored already.
        return;
    }
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
    $this->log_setTSlogMessage('Inserting words: ' . (count($wordListArray) - $storedCount), 1);
    // Drop the words that exist in the table so that only new ones remain.
    while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        unset($wordListArray[$row['baseword']]);
    }
    $GLOBALS['TYPO3_DB']->sql_free_result($res);
    foreach ($wordListArray as $word => $wordInfo) {
        // A duplicate-key error will occur here if a stored word was NOT removed from the list above. As long as the words are not longer than 60 chars (the baseword varchar is 60 characters) this is not a problem.
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
            'wid' => $wordInfo['hash'],
            'baseword' => $word,
            'metaphone' => $wordInfo['metaphone']
        ));
    }
}
1994
/**
 * Stores the relations between words and a phash in index_rel.
 *
 * All existing relations of the phash are deleted first, then one row per word is inserted.
 *
 * @param array $wordList Word list array (each entry holds "hash", "count", "first" and "cmp")
 * @param integer $phash phash value
 * @return void
 * @todo Define visibility
 */
public function submitWords($wordList, $phash) {
    if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . intval($phash));
    foreach ($wordList as $wordInfo) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', array(
            'phash' => $phash,
            'wid' => $wordInfo['hash'],
            'count' => $wordInfo['count'],
            'first' => $wordInfo['first'],
            // Share of this word in the total word count, mapped by freqMap().
            'freq' => $this->freqMap($wordInfo['count'] / $this->wordcount),
            'flags' => $wordInfo['cmp'] & $this->flagBitMask
        ));
    }
}
2019
/**
 * Maps a frequency between the real range [0;1] and the range [0;$this->freqRange].
 *
 * Values up to and including 1 are multiplied by $this->freqMax * 100 * $this->freqRange
 * and capped at $this->freqRange. Values above 1 are divided by the same factor
 * (the mapping "back").
 *
 * A frequency of exactly 1 (every word of the document is the same word) belongs to the
 * forward mapping; treating it as a value to map back would store it as almost zero.
 *
 * @param double $freq Frequency
 * @return float|integer Mapped frequency (not cast to integer here)
 * @todo Define visibility
 */
public function freqMap($freq) {
    $mapFactor = $this->freqMax * 100 * $this->freqRange;
    if ($freq <= 1) {
        $newFreq = $freq * $mapFactor;
        // Anything above the upper bound is clamped to it.
        $newFreq = $newFreq > $this->freqRange ? $this->freqRange : $newFreq;
    } else {
        $newFreq = $freq / $mapFactor;
    }
    return $newFreq;
}
2038
2039 /********************************
2040 *
2041 * Hashing
2042 *
2043 *******************************/
/**
 * Computes the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a page by id, type, language, mount point and cHash
 * parameters. "phash" additionally takes the gr_list into account, i.e. it
 * subdivides the grouping by login-dependent page composition.
 *
 * @return void
 * @todo Define visibility
 */
public function setT3Hashes() {
    $pageIdentity = array(
        'id' => intval($this->conf['id']),
        'type' => intval($this->conf['type']),
        'sys_lang' => intval($this->conf['sys_language_uid']),
        'MP' => strval($this->conf['MP']),
        'cHash' => $this->cHashParams
    );
    $this->hash['phash_grouping'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($pageIdentity));
    // Same identity with the group list appended as last element gives the plain phash.
    $loginIdentity = $pageIdentity + array('gr_list' => strval($this->conf['gr_list']));
    $this->hash['phash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($loginIdentity));
}
2065
/**
 * Computes the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the file
 * name together with $subinfo.
 *
 * @param string $file File name / path which identifies the file on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance, PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside
 * @todo Define visibility
 */
public function setExtHashes($file, $subinfo = array()) {
    $identity = array('file' => $file);
    $groupingHash = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity));
    // Extend the identity with the sub-part information for the plain phash.
    $identity['subinfo'] = $subinfo;
    return array(
        'phash_grouping' => $groupingHash,
        'phash' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($identity))
    );
}
2087
2088 /*********************************
2089 *
2090 * Internal logging functions
2091 *
2092 *********************************/
/**
 * Forwards a push() call to $GLOBALS['TT'] if that global holds an object.
 *
 * @param string $msg Title to set
 * @param string $key Key, passed through unchanged
 * @return void
 * @todo Define visibility
 */
public function log_push($msg, $key) {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->push($msg, $key);
}
2106
/**
 * Forwards a pull() call to $GLOBALS['TT'] if that global holds an object.
 *
 * @return void
 * @todo Define visibility
 */
public function log_pull() {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->pull();
}
2118
/**
 * Records a log message.
 *
 * The message is forwarded to $GLOBALS['TT'] if that global holds an object, and it
 * is always appended to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param integer $errorNum Error number, passed on to $GLOBALS['TT'] only
 * @return void
 * @todo Define visibility
 */
public function log_setTSlogMessage($msg, $errorNum = 0) {
    $timeTracker = $GLOBALS['TT'];
    if (is_object($timeTracker)) {
        $timeTracker->setTSlogMessage($msg, $errorNum);
    }
    $this->internal_log[] = $msg;
}
2133
2134 /**************************
2135 *
2136 * tslib_fe hooks:
2137 *
2138 **************************/
/**
 * Re-joins a comma-separated keyword list with ", " and wraps it in single spaces.
 * This is important for displaying the keywords properly as part of the fulltext index.
 *
 * @param string $keywordList Comma-separated keywords
 * @return string The keywords joined by ", " with one leading and one trailing space
 * @see http://bugs.typo3.org/view.php?id=1436
 */
protected function addSpacesToKeywordList($keywordList) {
    $trimmedKeywords = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $keywordList);
    $spacedList = implode(', ', $trimmedKeywords);
    return ' ' . $spacedList . ' ';
}
2151
2152 }
2153
2154
2155 ?>