[-FEATURE] Remove restricted class prefixes
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * Indexing class for TYPO3 frontend
35 *
36 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
37 * @package TYPO3
38 * @subpackage tx_indexedsearch
39 */
40 class tx_indexedsearch_indexer {
41
42 // Messages:
43 var $reasons = array(
44 -1 => 'mtime matched the document, so no changes detected and no content updated',
45 -2 => 'The minimum age was not exceeded',
46 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
47 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
48 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
49 4 => 'Page has never been indexed (is not represented in the index_phash table).'
50 );
51
52 // HTML code blocks to exclude from indexing:
53 var $excludeSections = 'script,style';
54
55 // Supported Extensions for external files:
56 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods.
57
58 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
59 var $defaultGrList = '0,-1';
60
61 // Min/Max times:
62 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
63 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
64 var $maxExternalFiles = 0; // Max number of external files to index.
65
66 var $forceIndexing = FALSE; // If TRUE, indexing is forced despite of hashes etc.
67 var $crawlerActive = FALSE; // Set when crawler is detected (internal)
68
69 // INTERNALS:
70 var $defaultContentArray=array(
71 'title' => '',
72 'description' => '',
73 'keywords' => '',
74 'body' => '',
75 );
76 var $wordcount = 0;
77 var $externalFileCounter = 0;
78
79 var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning)
80 var $indexerConfig = array(); // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
81 var $hash = array(); // Hash array, contains phash and phash_grouping
82 var $file_phash_arr = array(); // Hash array for files
83 var $contentParts = array(); // Content of TYPO3 page
84 var $content_md5h = '';
85 var $internal_log = array(); // Internal log
86 var $indexExternalUrl_content = '';
87
88 var $cHashParams = array(); // cHashparams array
89
90 var $freqRange = 32000;
91 var $freqMax = 0.1;
92
93 var $enableMetaphoneSearch = FALSE;
94 var $storeMetaphoneInfoAsWords;
95 var $metaphoneContent = '';
96
97 // Objects:
98 /**
99 * Charset class object
100 *
101 * @var t3lib_cs
102 */
103 var $csObj;
104
105 /**
106 * Metaphone object, if any
107 *
108 * @var user_DoubleMetaPhone
109 */
110 var $metaphoneObj;
111
112 /**
113 * Lexer object for word splitting
114 *
115 * @var tx_indexedsearch_lexer
116 */
117 var $lexerObj;
118
119 var $flagBitMask;
120
121 /**
122 * Parent Object (TSFE) Initialization
123 *
124 * @param object Parent Object (frontend TSFE object), passed by reference
125 * @return void
126 */
function hook_indexContent(&$pObj) {
	// Extension Manager settings of indexed_search (serialized in TYPO3_CONF_VARS).
	$extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// The crawler forces a re-index when it is loaded, a crawler session is running
	// and "tx_indexedsearch_reindex" was requested as processing instruction.
	$crawlerForcesReindex = t3lib_extMgm::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($crawlerForcesReindex) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

	// Nothing to do (and nothing to log) unless indexing is enabled for this page.
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page', '');

	// The checks run in a fixed order; the first one that fails decides the log message.
	if ($extConf['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid, $pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Reduce the rootline to a list of page uids, keeping the rootline keys.
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $level => $rootlinePage) {
			$rootlineUids[$level] = $rootlinePage['uid'];
		}

		$this->conf = array(
			// Identification of the page being indexed
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,
			// cHash string and the parameters it was calculated from
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,
			'crdate' => $pObj->page['crdate'],
			'page_cache_reg1' => $pObj->page_cache_reg1,
			'rootline_uids' => $rootlineUids,
			// Page content (HTML), alternative title, charset and last-change time
			'content' => $pObj->content,
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			'metaCharset' => $pObj->metaCharset,
			'mtime' => $pObj->register['SYS_LASTCHANGED'],
			// Behaviour switches taken from the "config." TypoScript
			'index_externals' => $pObj->config['config']['index_externals'],
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,
			// Not used for frontend indexing
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0,
		);

		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
211
212
213
214
215
216
217
218
219 /****************************
220 *
221 * Backend API
222 *
223 ****************************/
224
225 /**
226 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
227 *
228 * @param integer The page uid, &id=
229 * @param integer The page type, &type=
230 * @param integer sys_language uid, typically &L=
231 * @param string The MP variable (Mount Points), &MP=
232 * @param array Rootline array of only UIDs.
233 * @param array Array of GET variables to register with this indexing
234 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
235 * @return void
236 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {
	// The cHash is only calculated on request; otherwise it stays empty.
	$cHash = $createCHash ? t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array)) : '';

	$this->conf = array(
		// Identification of the page being indexed
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		// Group list is hardcoded for backend indexing
		'gr_list' => '0,-1',
		// cHash string and the parameters registered with this indexing
		'cHash' => $cHash,
		'cHash_array' => $cHash_array,
		// Defaults; the free-index values can be changed via backend_setFreeIndexUid()
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',
		// Rootline given as a plain list of uids
		'rootline_uids' => $uidRL,
		// Behaviour: index external documents, description length 200, index meta tags
		'index_externals' => 1,
		'index_descrLgd' => 200,
		'index_metatags' => TRUE,
	);

	// Initialize the indexer with the configuration built above.
	$this->init();
}
269
270 /**
271 * Sets the free-index uid. Can be called right after backend_initIndexer()
272 *
273 * @param integer Free index UID
274 * @param integer Set id - an integer identifying the "set" of indexing operations.
275 * @return void
276 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	// Copy both values into the internal configuration, uid first and set id second.
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId,
	);
	foreach ($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
281
282 /**
283 * Indexing records as the content of a TYPO3 page.
284 *
285 * @param string Title equivalent
286 * @param string Keywords equivalent
287 * @param string Description equivalent
288 * @param string The main content to index
289 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
290 * @param integer Last modification time, in seconds
291 * @param integer The creation date of the content, in seconds
292 * @param integer The record UID that the content comes from (for registration with the indexed rows)
293 * @return void
294 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {
	// All parts are escaped so they survive the HTML parsing done by the page indexer.
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);

	// Minimal fake HTML document carrying title, meta tags and body.
	$fakeDocument = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedBody . '
</body>
</html>';

	// Timestamps and the uid of the record the content originates from.
	$this->conf['mtime'] = $mtime;
	$this->conf['crdate'] = $crdate;
	$this->conf['recordUid'] = $recordUid;

	// Content plus its charset; no alternative document title is used here.
	$this->conf['content'] = $fakeDocument;
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

	// From here on the content is handled exactly like a rendered TYPO3 page.
	$this->indexTypo3PageContent();
}
322
323
324
325
326
327
328
329
330
331
332
333
334
335 /********************************
336 *
337 * Initialization
338 *
339 *******************************/
340
341 /**
342 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
343 *
344 * @return void
345 */
function init() {
	// cHash parameters: add the cHash itself (so URLs come out right) and make sure
	// the encryption key never ends up in this array, since it would be exposed in links.
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}

	// phash / phash_grouping identifying the indexed page.
	$this->setT3Hashes();

	// Settings from the Extension Manager; min/max age are configured in hours.
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['minAge'] * 3600, 0);
	$this->tstamp_maxAge = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['maxAge'] * 3600, 0);
	$this->maxExternalFiles = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
	$this->flagBitMask = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);

	// Metaphone search defaults to enabled when the extension configuration
	// has not been updated yet and therefore lacks the option.
	$this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch'])
		|| (bool) $this->indexerConfig['enableMetaphoneSearch'];

	// Metaphone info is only stored as words when the index_words table is not in use.
	$this->storeMetaphoneInfoAsWords = !tx_indexedsearch_util::isTableUsed('index_words')
		&& $this->enableMetaphoneSearch;

	// External document parsers are only needed when external files get indexed.
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

	// Lexer (splits text into words); falls back to the lexer shipped with the extension.
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer']
		?: 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$this->lexerObj = t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

	// Metaphone hook, only when enabled and configured. The hook must be registered
	// after indexed_search was loaded, as the extension may overwrite it.
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}

	// Charset conversion object.
	$this->csObj = t3lib_div::makeInstance('t3lib_cs');
}
399
400 /**
401 * Initialize external parsers
402 *
403 * @return void
404 * @access private
405 * @see init()
406 */
function initializeExternalParsers() {
	$parserReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
	if (!is_array($parserReferences)) {
		return;
	}

	foreach ($parserReferences as $extension => $_objRef) {
		$parser = t3lib_div::getUserObj($_objRef);

		// A reference that does not resolve to an object cannot be initialized.
		// Skip it instead of failing on the property assignment / method call below.
		if (!is_object($parser)) {
			unset($this->external_parsers[$extension]);
			continue;
		}

		// Register the parser first, then let it decide whether it is usable.
		$this->external_parsers[$extension] = $parser;
		$parser->pObj = $this;

		// Init parser and if it returns FALSE, unset its entry again:
		if (!$parser->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437 /********************************
438 *
439 * Indexing; TYPO3 pages (HTML content)
440 *
441 *******************************/
442
443 /**
444 * Start indexing of the TYPO3 page
445 *
446 * @return void
447 */
function indexTypo3PageContent() {
	$mtimeCheck = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$grListIsSet = $this->is_grlist_set($this->hash['phash']);

	// No positive check result, gr_list already registered and no forced indexing: done.
	if (!($mtimeCheck > 0) && $grListIsSet && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$mtimeCheck]);
		return;
	}

	// Log why indexing is going to happen.
	if ($this->forceIndexing) {
		$reason = 'Forced';
	} elseif ($mtimeCheck > 0) {
		$reason = $this->reasons[$mtimeCheck];
	} else {
		$reason = 'Updates gr_list!';
	}
	$this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

	// Split the HTML into title, keywords, description and body;
	// a configured alternative title replaces the parsed one.
	$this->log_push('Split content', '');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

	// Hash over all content parts (title, keywords, description and body).
	$this->content_md5h = tx_indexedsearch_util::md5inthash(implode('', $this->contentParts));

	// An array result means another index entry already has the very same content hash.
	// Unless the check result is exactly 1, only the bookkeeping data is refreshed then.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $mtimeCheck !== 1) {
		$this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
		$this->updateSetId($this->hash['phash']);
		// $checkCHash['phash'] is the phash of the row with the identical content hash.
		$this->update_grlist($checkCHash['phash'], $this->hash['phash']);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

	// Full indexing; the parse time is measured from here.
	$Pstart = t3lib_div::milliseconds();

	$this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
	$this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
	$this->log_pull();

	$this->log_push('Extract words from content', '');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

	$this->log_push('Analyse the extracted words', '');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

	// Page (phash) record
	$this->log_push('Submitting page', '');
	$this->submitPage();
	$this->log_pull();

	// The word list is only maintained when the index_words table is in use.
	$this->log_push('Check word list and submit words', '');
	if (tx_indexedsearch_util::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $this->hash['phash']);
	}
	$this->log_pull();

	$this->updateParsetime($this->hash['phash'], t3lib_div::milliseconds() - $Pstart);

	// Linked external files are handled last, if configured.
	$this->log_push('Checking external files', '');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
531
532 /**
533 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
534 *
535 * @param string HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
536 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
537 * @see splitRegularContent()
538 */
function splitHTMLContent($content) {
	$contentArr = $this->defaultContentArray;

	// Divide head from body. Without a <body> tag the whole document is treated as
	// head: substr($content, 0, -0) would otherwise yield an empty head, so title
	// and meta tags of such documents would be lost.
	$bodyPart = stristr($content, '<body');
	if ($bodyPart === FALSE) {
		$headPart = $content;
		$bodyPart = '';
	} else {
		$headPart = substr($content, 0, strlen($content) - strlen($bodyPart));
	}
	$contentArr['body'] = $bodyPart;

	// Title: if it contains a colon, only the part after the first colon is used.
	$this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
	$titleParts = explode(':', $contentArr['title'], 2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

	// Keywords and description meta tags
	if ($this->conf['index_metatags']) {
		// Collect the attribute strings of all <meta> tags; $headPart shrinks with every match.
		$meta = array();
		$i = 0;
		while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
			$i++;
		}
		// TODO The code below stops at first unset tag. Is that correct?
		for ($i = 0; isset($meta[$i]); $i++) {
			$meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
			if (stristr($meta[$i]['name'], 'keywords')) {
				$contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($meta[$i]['content']);
			}
			if (stristr($meta[$i]['name'], 'description')) {
				$contentArr['description'] .= ',' . $meta[$i]['content'];
			}
		}
	}

	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

	// Get rid of unwanted sections (ie. scripting and style stuff) in body
	foreach (explode(',', $this->excludeSections) as $tag) {
		while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2));
	}

	// Remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

	return $contentArr;
}
589
590 /**
591 * Extract the charset value from HTML meta tag.
592 *
593 * @param string HTML content
594 * @return string The charset value if found, otherwise NULL.
595 */
function getHTMLcharset($content) {
	// Only a <meta http-equiv="Content-Type" ...> tag is considered.
	if (!preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)) {
		return NULL;
	}

	// Charset names may contain "_", "." and ":" besides alphanumerics and "-"
	// (e.g. "Shift_JIS"), so those characters must be part of the match.
	if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]_.:-]+)/i', $reg[0], $reg2)) {
		return $reg2[1];
	}

	// No charset found; callers only test the result for truthiness.
	return NULL;
}
603
604 /**
605 * Converts a HTML document to utf-8
606 *
607 * @param string HTML content, any charset
608 * @param string Optional charset (otherwise extracted from HTML)
609 * @return string Converted HTML
610 */
function convertHTMLToUtf8($content, $charset = '') {
	// Without an explicit charset, look for one in the HTML itself.
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$normalizedCharset = $this->csObj->parse_charset($charset);

	// Convert only when a charset is known and the content is not utf-8 already.
	$needsConversion = $normalizedCharset && $normalizedCharset !== 'utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $normalizedCharset);
	}

	// Entities are converted under the assumption that the document is utf-8 by now.
	return $this->csObj->entities_to_utf8($content, TRUE);
}
626
627 /**
628 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
629 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
630 * <title> of document or removing <script>-sections
631 *
632 * @param string String to search in
633 * @param string Tag name, eg. "script"
634 * @param string Passed by reference: Content inside found tag
635 * @param string Passed by reference: Content after found tag
636 * @param string Passed by reference: Attributes of the found tag.
637 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
638 */
function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
	$endTag = '</' . $tagName . '>';
	$startTag = '<' . $tagName;

	// Case-insensitive search for the opening tag; FALSE if it does not occur.
	$startPos = stripos($string, $startTag);
	if ($startPos === FALSE) {
		return FALSE;
	}

	// Everything up to the next ">" is the attribute list, the rest follows the opening tag.
	// An opening tag that is never closed by ">" yields no second part: use an empty
	// string instead of reading an undefined offset.
	$tagParts = explode('>', substr($string, $startPos + strlen($startTag)), 2);
	$paramList = $tagParts[0];
	$afterOpeningTag = isset($tagParts[1]) ? $tagParts[1] : '';

	$endPos = stripos($afterOpeningTag, $endTag);
	if ($endPos !== FALSE) {
		// Content is what lies between the tags; the remaining string is the original
		// with the whole element cut out.
		$tagContent = substr($afterOpeningTag, 0, $endPos);
		$stringAfter = substr($string, 0, $startPos) . substr($afterOpeningTag, $endPos + strlen($endTag));
	} else {
		// If there was no ending tag, the tagContent is blank and anything after the tag itself is returned.
		$tagContent = '';
		$stringAfter = $afterOpeningTag;
	}

	return TRUE;
}
664
665 /**
666 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
667 *
668 * @param string HTML Content, passed by reference
669 * @return boolean Returns TRUE if a TYPO3SEARCH_ tag was found, otherwise FALSE.
670 */
function typoSearchTags(&$body) {
	// Split at every "<!--TYPO3SEARCH_" marker; a single segment means no marker was found.
	$segments = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/', $body);
	if (count($segments) <= 1) {
		return FALSE;
	}

	$body = '';
	// Segment remembered for a following "end" marker. Initialized so that an "end"
	// marker can never read an undefined variable.
	$prev = '';

	foreach ($segments as $segment) {
		// $part[0] is the marker name ("begin"/"end"), $part[1] the content after "-->".
		$part = explode('-->', $segment, 2);
		$marker = trim($part[0]);

		if ($marker === 'begin') {
			// Content after a begin marker is indexed. The marker may lack its closing "-->".
			$body .= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker === 'end') {
			// Content after an end marker is dropped; the remembered segment is appended instead.
			$body .= $prev;
		} else {
			$prev = $segment;
		}
	}

	return TRUE;
}
693
694 /**
695 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
696 *
697 * @param string HTML content
698 * @return void
699 */
function extractLinks($content) {
	// Get links:
	$list = $this->extractHyperLinks($content);

	// The crawler object is only created when external files are to be queued by the
	// "crawler" extension. It stays NULL otherwise, so it is always defined below.
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

	foreach ($list as $linkInfo) {
		// localPath means: This file is sent by a download script. While the indexed URL
		// has to point to $linkInfo['href'], the absolute path to the file is given there.
		$hasLocalPath = (bool) $linkInfo['localPath'];
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

		// parse_url() returns FALSE for seriously malformed URLs; treat that as "no parts".
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}

		// Check for jumpurl (TYPO3 specific thing...): the real target is in the parameter.
		if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
			parse_str($qParts['query'], $getP);
			$linkSource = $getP['jumpurl'];
			// Parse again due to new linkSource!
			$qParts = parse_url($linkSource);
			if (!is_array($qParts)) {
				$qParts = array();
			}
		}

		if (!$hasLocalPath && !empty($qParts['scheme'])) {
			// External URL (http or otherwise), indexed only if enabled.
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

		// Links carrying a query string are not treated as local files.
		if (!empty($qParts['query'])) {
			continue;
		}

		$linkSource = urldecode($linkSource);
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site . $linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		if (is_object($crawler)) {
			// Hand the file over to the crawler queue. The page content is not passed along.
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0, $params, 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files', $this->conf['id']);
			$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
		} elseif ($hasLocalPath) {
			// Index under the href, read the content from the local path.
			$fI = pathinfo($linkSource);
			$ext = strtolower($fI['extension']);
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
779
780 /**
781 * Extracts all links to external documents from the HTML content string
782 *
783 * @param string $html
784 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
785 * @see extractLinks()
786 */
function extractHyperLinks($html) {
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('a', $html);
	$hyperLinksData = array();

	foreach ($htmlParts as $index => $tagData) {
		// Only the odd-indexed parts are inspected as tags.
		if (($index % 2) === 0) {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

		// Skip links without href and pure anchor links ("#...").
		// substr() replaces the curly-brace string offset, which is a parse error in PHP 8.
		$href = $tagAttributes[0]['href'];
		if ($href && substr($href, 0, 1) != '#') {
			$hyperLinksData[] = array(
				'tag' => $tagData,
				'href' => $href,
				'localPath' => $this->createLocalPath($href)
			);
		}
	}

	return $hyperLinksData;
}
810
811 /**
812 * Extracts the "base href" from content string.
813 *
814 * @param string Content to analyze
815 * @return string The base href or an empty string if not found
816 */
public function extractBaseHref($html) {
	$baseHref = '';
	$parser = t3lib_div::makeInstance('t3lib_parseHtml');
	$parts = $parser->splitTags('base', $html);

	foreach ($parts as $position => $part) {
			// Only the odd-indexed parts of the split result are inspected as tags
		if (($position % 2) === 0) {
			continue;
		}

		$attributes = $parser->get_tag_attributes($part, TRUE);
		$tagName = $parser->getFirstTagName($part);
		if (strtolower($tagName) != 'base') {
			continue;
		}

			// Remember the href of this <base> tag; stop at the first non-empty one
		$baseHref = $attributes[0]['href'];
		if ($baseHref) {
			break;
		}
	}

	return $baseHref;
}
836
837 /******************************************
838 *
839 * Indexing; external URL
840 *
841 ******************************************/
842
843 /**
844 * Index External URLs HTML content
845 *
846 * @param string URL, eg. "http://typo3.org/"
847 * @return void
848 * @see indexRegularDocument()
849 */
function indexExternalUrl($externalUrl) {

		// Get headers. Nothing can be indexed without a usable header array.
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
		return;
	}

		// HTTP header field names are case-insensitive, so "content-type" must match as well.
		// No early break: like a plain array lookup, the last matching header wins.
	$contentType = '';
	foreach ($urlHeaders as $headerName => $headerValue) {
		if (strtolower(trim((string) $headerName)) === 'content-type') {
			$contentType = (string) $headerValue;
		}
	}

		// Only HTML documents are fetched and indexed
	if (!stristr($contentType, 'text/html')) {
		return;
	}

		// The fetched content is also kept in $this->indexExternalUrl_content
	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

		// Create temporary file:
	$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
	if ($tmpFile) {
		t3lib_div::writeFile($tmpFile, $content);

			// Index that file. TRUE as second parameter forces indexing, because the
			// mtime of a freshly written temporary file says nothing about the remote document.
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
		unlink($tmpFile);
	}
}
875
876 /**
877 * Getting HTTP request headers of URL
878 *
879 * @param string The URL
880 * @param integer Timeout (seconds?)
881 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
882 */
function getUrlHeaders($url) {
		// Try to get the headers only
	$content = t3lib_div::getUrl($url, 2);

	if (!strlen($content)) {
			// No answer: return FALSE explicitly, as documented, instead of falling off the end (NULL)
		return FALSE;
	}

		// Compile headers:
	$retVal = array();
	$headers = t3lib_div::trimExplode(LF, $content, 1);
	foreach ($headers as $line) {
		if (!strlen(trim($line))) {
			break; // Stop at the first empty line (= end of header)
		}

			// Split into name and value. A line without a colon (presumably the HTTP
			// status line - verify against getUrl()) is kept as a key with an empty value.
		$lineParts = explode(':', $line, 2);
		$headKey = trim($lineParts[0]);
		$headValue = isset($lineParts[1]) ? trim($lineParts[1]) : '';
		$retVal[$headKey] = $headValue;
	}

	return $retVal;
}
902
903
904
905 /**
906 * Checks if the file is local
907 *
908 * @param $sourcePath
909 * @return string Absolute path to file if file is local, else empty string
910 */
protected function createLocalPath($sourcePath) {
		// Resolver methods, tried in this order; the first non-empty result is used
	static $resolvers = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);

	$resolvedPath = '';
	foreach ($resolvers as $resolver) {
		$resolvedPath = $this->$resolver($sourcePath);
		if ($resolvedPath != '') {
			return $resolvedPath;
		}
	}

		// No resolver produced a path: hand back the (empty) result of the last attempt
	return $resolvedPath;
}
928
929 /**
930 * Attempts to create a local file path from T3VARs. This is useful for
931 * various download extensions that hide actual file name but still want the
932 * file to be indexed.
933 *
934 * @param string $sourcePath
935 * @return string
936 */
protected function createLocalPathFromT3vars($sourcePath) {
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}

		// Registered files are looked up by the short MD5 of the source path
	$lookupKey = t3lib_div::shortMD5($sourcePath);

		// Note: self::isAllowedLocalFile is deliberately not used here, because
		// this method is allowed to index files outside of the web site (for
		// example, protected downloads)
	$isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);

	return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
951
952 /**
953 * Attempts to create a local file path by matching a current request URL.
954 *
955 * @param string $sourcePath
956 * @return string
957 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	$siteUrl = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);

		// Only paths that start with the site URL can be mapped onto PATH_site
	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
971
972 /**
973 * Attempts to create a local file path by matching absRefPrefix. This
974 * requires TSFE. If TSFE is missing, this function does nothing.
975 *
976 * @param string $sourcePath
977 * @return string
978 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
		// Without a frontend object there is no absRefPrefix to match against
	if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
		return '';
	}

	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);

		// An empty prefix never matches; otherwise the path has to start with it
	if (!($prefixLength > 0) || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $prefixLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
994
995 /**
996 * Attempts to create a local file path from the absolute URL without
997 * schema.
998 *
999 * @param string $sourcePath
1000 * @return string
1001 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	$localPath = '';

		// strncmp() replaces the {0} string offset syntax (a parse error in PHP 8)
		// and does not raise a notice for an empty $sourcePath
	if (strncmp($sourcePath, '/', 1) === 0) {
			// Strip the leading slash and resolve the rest relative to the site root
		$localPath = PATH_site . substr($sourcePath, 1);
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}

	return $localPath;
}
1013
1014 /**
1015 * Attempts to create a local file path from the relative URL.
1016 *
1017 * @param string $sourcePath
1018 * @return string
1019 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}

		// A relative URL is resolved against the site root
	$candidate = PATH_site . $sourcePath;

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1030
1031 /**
1032 * Checks if URL is relative.
1033 *
1034 * @param string $url
1035 * @return boolean
1036 */
static protected function isRelativeURL($url) {
		// parse_url() returns FALSE for seriously malformed URLs and omits absent
		// components; both cases are treated as "component is empty", as before
	$urlParts = @parse_url($url);
	$scheme = (is_array($urlParts) && isset($urlParts['scheme'])) ? (string) $urlParts['scheme'] : '';
	$path = (is_array($urlParts) && isset($urlParts['path'])) ? (string) $urlParts['path'] : '';

		// Relative means: no scheme and a path that does not start at the root.
		// strncmp() replaces the {0} string offset syntax (a parse error in PHP 8).
	return ($scheme === '' && strncmp($path, '/', 1) !== 0);
}
1041
1042 /**
1043 * Checks if the path points to the file inside the web site
1044 *
1045 * @param string $filePath
1046 * @return boolean
1047 */
static protected function isAllowedLocalFile($filePath) {
	$resolvedPath = t3lib_div::resolveBackPath($filePath);

		// The resolved path has to start with PATH_site ...
	$leadingPart = substr($resolvedPath, 0, strlen(PATH_site));
	if ($leadingPart != PATH_site) {
		return FALSE;
	}

		// ... and has to point to an existing regular file
	return is_file($resolvedPath);
}
1054
1055 /******************************************
1056 *
1057 * Indexing; external files (PDF, DOC, etc)
1058 *
1059 ******************************************/
1060
1061 /**
1062 * Indexing a regular document given as $file (relative to PATH_site, local file)
1063 *
1064 * @param string Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
1065 * @param boolean If set, indexing is forced (despite content hashes, mtime etc).
1066 * @param string Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
1067 * @param string File extension for temporary file.
1068 * @return void
1069 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

		// Init: an explicitly passed extension wins over the one in the file name.
		// PATHINFO_EXTENSION yields '' (no notice) for names without extension.
	$ext = $altExtension ? $altExtension : strtolower(pathinfo($file, PATHINFO_EXTENSION));

		// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) { // Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site . $file);
		} else { // Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
			// Content is read from the temporary file; $file is only used as identifier
		$absFile = $contentTmpFile;
	}

		// Nothing to index without a readable file ...
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}

		// ... or without a parser registered for the extension
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

		// Indexing the document, one pass per content part:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext, $absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey); // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file, $subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag', 1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

					// Divide into title,keywords,description and body:
				$this->log_push('Split content', '');
				$contentParts = $this->readFileContent($ext, $absFile, $cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
						// The separator goes first: the reversed implode() argument order is rejected by PHP 8.
					$content_md5h = tx_indexedsearch_util::md5inthash(implode('', $contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content', '');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words', '');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page', '');
						$size = filesize($absFile);
							// The creation time of a file cannot be determined here, so the modification time is used instead
						$ctime = filemtime($absFile);
						$this->submitFilePage($phash_arr, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words', '');
						if (tx_indexedsearch_util::isTableUsed('index_words')) {
							$this->checkWordList($indexArr);
							$this->submitWords($indexArr, $phash_arr['phash']);
						}
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'], t3lib_div::milliseconds() - $Pstart);
					} else {
							// Update the timestamp
						$this->updateTstamp($phash_arr['phash'], $mtime);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// Checking and setting sections:
			// A section record is set for the file even if the file was not indexed.
			// Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
1179
1180 /**
1181 * Reads the content of an external file being indexed.
1182 * The content from the external parser MUST be returned in utf-8!
1183 *
1184 * @param string File extension, eg. "pdf", "doc" etc.
1185 * @param string Absolute filename of file (must exist and be validated OK before calling function)
1186 * @param string Pointer to section (zero for all file types other than PDF, for which it indicates the range of pages into which the document should be split.)
1187 * @return array Standard content array (title, description, keywords, body keys)
1188 */
function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
		// Without a parser object for this extension there is nothing to read
	if (!is_object($this->external_parsers[$fileExtension])) {
		return NULL;
	}

		// Delegate to the external document parser registered for the extension
	return $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1199
1200 /**
1201 * Creates an array with pointers to divisions of document.
1202 *
1203 * @param string File extension
1204 * @param string Absolute filename (must exist and be validated OK before calling function)
1205 * @return array Array of pointers to sections that the document should be divided into
1206 */
function fileContentParts($ext, $absFile) {
		// Let the external document parser for this extension decide on the divisions
	if (is_object($this->external_parsers[$ext])) {
		return $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
	}

		// Default: one single part with pointer 0
	return array(0);
}
1217
1218 /**
1219 * Splits non-HTML content (from external files for instance)
1220 *
1221 * @param string Input content (non-HTML) to index.
1222 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
1223 * @see splitHTMLContent()
1224 */
function splitRegularContent($content) {
		// Start from a copy of the default content array ...
	$parts = $this->defaultContentArray;

		// ... and put the whole input into its "body" key
	$parts['body'] = $content;

	return $parts;
}
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245 /**********************************
1246 *
1247 * Analysing content, Extracting words
1248 *
1249 **********************************/
1250
1251 /**
1252 * Convert character set and HTML entities in the value of input content array keys
1253 *
1254 * @param array Standard content array
1255 * @param string Charset of the input content (converted to utf-8)
1256 * @return void
1257 */
function charsetEntity2utf8(&$contentArr, $charset) {
	$needsCharsetConversion = ($charset !== 'utf-8');

	foreach ($contentArr as $partName => $unused) {
			// Empty parts are left untouched
		if (!strlen($contentArr[$partName])) {
			continue;
		}

		$text = $contentArr[$partName];

			// Convert charset if necessary
		if ($needsCharsetConversion) {
			$text = $this->csObj->utf8_encode($text, $charset);
		}

			// Decode all numeric / html-entities in the string to real characters
		$contentArr[$partName] = $this->csObj->entities_to_utf8($text, TRUE);
	}
}
1273
1274 /**
1275 * Processing words in the array from split*Content -functions
1276 *
1277 * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
1278 * @return array Content input array modified so that each key now holds an array of words (without duplicates for title, keywords and description)
1279 */
function processWordsInArrays($contentArr) {

		// Let the lexer split every part into words
	foreach ($contentArr as $partName => $text) {
		$contentArr[$partName] = $this->lexerObj->split2Words($text);
	}

		// Title, keywords and description must not contain duplicates
	foreach (array('title', 'keywords', 'description') as $uniquePart) {
		$contentArr[$uniquePart] = array_unique($contentArr[$uniquePart]);
	}

	return $contentArr;
}
1295
1296 /**
1297 * Extracts the sample description text from the content array.
1298 *
1299 * @param array Content array
1300 * @return string Description string
1301 */
function bodyDescription($contentArr) {
		// Default result, so the function also returns a string when the description length is 0
	$bodyDescription = '';

		// Description length from configuration, forced into 0-255 (default 200)
	$maxL = t3lib_utility_Math::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
	if ($maxL) {
			// NOTE(review): the first search string shows up as a plain space here, which makes
			// that replacement a no-op; the upstream file may have had an HTML entity - confirm.
		$bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);

			// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1315
1316 /**
1317 * Analyzes content to use for indexing,
1318 *
1319 * @param array Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
1320 * @return array Index Array (whatever that is...)
1321 */
function indexAnalyze($content) {
	$indexArr = array();

		// Header fields first; the last argument is the bit position that
		// analyzeHeaderinfo() sets in the "cmp" flags of each word
	$this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
	$this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
	$this->analyzeHeaderinfo($indexArr, $content, 'description', 5);

		// Then the body words
	$this->analyzeBody($indexArr, $content);

	return $indexArr;
}
1333
1334 /**
1335 * Calculates relevant information for headercontent
1336 *
1337 * @param array Index array, passed by reference
1338 * @param array Standard content array
1339 * @param string Key from standard content array
1340 * @param integer Bit-wise priority to type
1341 * @return void
1342 */
function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
	foreach ($content[$key] as $val) {
			// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);

		if (!isset($retArr[$val])) {
				// Word ID (wid)
			$retArr[$val]['hash'] = tx_indexedsearch_util::md5inthash($val);

				// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch
				? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60)
				: '';
			$retArr[$val]['metaphone'] = $metaphone;
		}

			// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}

			// Priority used for flagBitMask feature (see extension configuration).
			// A word seen for the first time has no flags yet; start from 0 instead of reading an undefined index.
		$currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);

			// Increase number of occurrences (starting from 0 for a new word)
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1371
1372 /**
1373 * Calculates relevant information for bodycontent
1374 *
1375 * @param array Index array, passed by reference
1376 * @param array Standard content array
1377 * @return void
1378 */
function analyzeBody(&$retArr, $content) {
	foreach ($content['body'] as $key => $val) {
			// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);

		if (!isset($retArr[$val])) {
				// First occurrence (used for ranking results)
			$retArr[$val]['first'] = $key;

				// Word ID (wid)
			$retArr[$val]['hash'] = tx_indexedsearch_util::md5inthash($val);

				// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch
				? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60)
				: '';
			$retArr[$val]['metaphone'] = $metaphone;
		}

			// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}

			// Increase number of occurrences; a new word starts from 0 instead of incrementing an undefined index
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1407
1408 /**
1409 * Creating metaphone based hash from input word
1410 *
1411 * @param string Word to convert
1412 * @param boolean If set, returns the raw metaphone value (not hashed)
1413 * @return mixed Metaphone hash integer (or raw value, string)
1414 */
function metaphone($word, $returnRawMetaphoneValue=FALSE) {

		// Prefer the configured metaphone object; fall back to PHP's native metaphone()
	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

		// Raw value requested: hand it back unhashed
	if ($returnRawMetaphoneValue) {
		return $rawValue;
	}

		// An empty metaphone value is represented by 0
	if (!strlen($rawValue)) {
		return 0;
	}

		// Create hash and return integer
	return tx_indexedsearch_util::md5inthash($rawValue);
}
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451 /********************************
1452 *
1453 * SQL; TYPO3 Pages
1454 *
1455 *******************************/
1456
1457 /**
1458 * Updates db with information about the page (TYPO3 page, not external media)
1459 *
1460 * @return void
1461 */
function submitPage() {
	$phash = $this->hash['phash'];

		// Remove any current data for this phash:
	$this->removeOldIndexedPages($phash);

		// Row for index_phash
	$phashRow = array(
		'phash' => $phash,
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0, // TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'item_crdate' => $this->conf['crdate'], // Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'], // Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	if (tx_indexedsearch_util::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
	}

		// index_section: for a TYPO3 page both hashes are the page's own phash
	$this->submit_section($phash, $phash);

		// index_grlist
	$this->submit_grlist($phash, $phash);

		// index_fulltext, optionally cut to the configured maximum length
	$fulltextRow = array(
		'phash' => $phash,
		'fulltextdata' => implode(' ', $this->contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	$fullTextLimit = $this->indexerConfig['fullTextDataLength'];
	if ($fullTextLimit>0) {
		$fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $fullTextLimit);
	}
	if (tx_indexedsearch_util::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
	}

		// index_debug is only filled in debug mode
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

		// Page content and body are cut to 1000 characters for the debug record
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf, array('content'=>substr($this->conf['content'], 0, 1000))),
		'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $phash,
		'debuginfo' => serialize($debugInfo)
	);
	if (tx_indexedsearch_util::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1534
1535 /**
1536 * Stores gr_list in the database.
1537 *
1538 * @param integer Search result record phash
1539 * @param integer Actual phash of current content
1540 * @return void
1541 * @see update_grlist()
1542 */
function submit_grlist($hash, $phash_x) {
	$groupList = $this->conf['gr_list'];

		// The gr_list record: the group list itself plus its integer hash
	$grlistRow = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => tx_indexedsearch_util::md5inthash($groupList),
		'gr_list' => $groupList
	);

	if (tx_indexedsearch_util::isTableUsed('index_grlist')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grlistRow);
	}
}
1556
1557 /**
1558 * Stores section
1559 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
1560 *
1561 * @param integer phash of TYPO3 parent search result record
1562 * @param integer phash of the file indexation search record
1563 * @return void
1564 */
function submit_section($hash, $hash_t3) {
	$sectionRow = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

		// The row is handed to getRootLineFields() before it is stored
	$this->getRootLineFields($sectionRow);

	if (tx_indexedsearch_util::isTableUsed('index_section')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
	}
}
1578
1579 /**
1580 * Removes records for the indexed page, $phash
1581 *
1582 * @param integer phash value to flush
1583 * @return void
1584 */
function removeOldIndexedPages($phash) {
	$phashValue = intval($phash);

		// Removing old registrations for all tables. Because the pages are TYPO3 pages
		// there can be nothing else than 1-1 relations here.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		if (tx_indexedsearch_util::isTableUsed($tableName)) {
			$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . $phashValue);
		}
	}

		// Removing all index_section records with phash_t3 set to this hash (this includes
		// such records set for external media on the page as well!). These records are
		// re-inserted by indexRegularDocument($file).
	if (tx_indexedsearch_util::isTableUsed('index_section')) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $phashValue);
	}
}
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611 /********************************
1612 *
1613 * SQL; External media
1614 *
1615 *******************************/
1616
1617
1618 /**
1619 * Updates db with information about the file
1620 *
1621 * @param array Array with phash and phash_grouping keys for file
1622 * @param string File name
1623 * @param array Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
1624 * @param string File extension determining the type of media.
1625 * @param integer Modification time of file.
1626 * @param integer Creation time of file.
1627 * @param integer Size of file in bytes
1628 * @param integer Content HASH value.
1629 * @param array Standard content array (using only title and body for a file)
1630 * @return void
1631 */
function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {

		// Find item type: the parser's mapping for the extension, or the extension itself
	$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	if (!$storeItemType) {
		$storeItemType = $ext;
	}

		// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

		// Split filename; a scheme part marks the file as external URL
	$fileParts = parse_url($file);

		// Fall back to the file name when the content has no title
	$itemTitle = trim($contentParts['title']) ? $contentParts['title'] : basename($file);

		// Row for index_phash
	$phashRow = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => $itemTitle,
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $fileParts['scheme'] ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	if (tx_indexedsearch_util::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
	}

		// index_fulltext, optionally cut to the configured maximum length
	$fulltextRow = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	$fullTextLimit = $this->indexerConfig['fullTextDataLength'];
	if ($fullTextLimit>0) {
		$fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $fullTextLimit);
	}
	if (tx_indexedsearch_util::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
	}

		// index_debug is only filled in debug mode
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

		// The body is cut to 1000 characters for the debug record
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	if (tx_indexedsearch_util::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1698
1699 /**
1700 * Stores file gr_list for a file IF it does not exist already
1701 *
1702 * @param integer phash value of file
1703 * @return void
1704 */
function submitFile_grlist($hash) {
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

		// Look for an existing gr_list record of this file, either for the default
		// group list or for the current one; if there is one, no other is needed.
	$where = 'phash=' . intval($hash) . ' AND (hash_gr_list=' . tx_indexedsearch_util::md5inthash($this->defaultGrList) . ' OR hash_gr_list=' . tx_indexedsearch_util::md5inthash($this->conf['gr_list']) . ')';
	$existingRecords = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);

	if ($existingRecords == 0) {
		$this->submit_grlist($hash, $hash);
	}
}
1714
1715 /**
1716 * Stores file section for a file IF it does not exist
1717 *
1718 * @param integer phash value of file
1719 * @return void
1720 */
function submitFile_section($hash) {
	if (!tx_indexedsearch_util::isTableUsed('index_section')) {
		return;
	}

		// Testing if there is already a section record for this file on the current page
	$where = 'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id']);
	$existingRecords = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);

		// None yet: store one, with the current page's phash as phash_t3
	if ($existingRecords == 0) {
		$this->submit_section($hash, $this->hash['phash']);
	}
}
1730
1731 /**
1732 * Removes records for the indexed page, $phash
1733 *
1734 * @param integer phash value to flush
1735 * @return void
1736 */
function removeOldIndexedFiles($phash) {
	$phashValue = intval($phash);

		// Removing old registrations for tables (index_section is not touched here)
	$tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		if (tx_indexedsearch_util::isTableUsed($tableName)) {
			$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . $phashValue);
		}
	}
}
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760 /********************************
1761 *
1762 * SQL Helper functions
1763 *
1764 *******************************/
1765
1766 /**
1767 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
1768 * Return positive integer if the page needs to be indexed
1769 *
1770 * @param integer mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
1771 * @param integer "phash" used to select any already indexed page to see what its mtime is.
1772 * @return integer Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) mtime matched so no need to reindex page. 0) N/A 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index.
1773 */
function checkMtimeTstamp($mtime, $phash) {
	// Without the index_phash table nothing can have been indexed before.
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return 4;
	}

	$indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . intval($phash));

	// 4: Page has never been indexed (is not represented in the index_phash table).
	if (!$indexedRow) {
		return 4;
	}

	$now = $GLOBALS['EXEC_TIME'];

	// 1: The configured max-age was exceeded for the document and thus it's indexed.
	if ($this->tstamp_maxAge && ($indexedRow['tstamp'] + $this->tstamp_maxAge) < $now) {
		return 1;
	}

	// -2: A minimum age is configured and has not been exceeded yet.
	if ($this->tstamp_minAge && ($indexedRow['tstamp'] + $this->tstamp_minAge) >= $now) {
		return -2;
	}

	// 3: No mtime was given, so the page must be indexed.
	if (!$mtime) {
		return 3;
	}

	// 2: The given mtime differs from the one stored in index_phash, so re-index.
	if ($indexedRow['item_mtime'] != $mtime) {
		return 2;
	}

	// -1: mtime matched the document, so no changes detected and no content updated.
	// The timestamp is only refreshed when no max-age is configured.
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $now) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
	}
	return -1;
}
1823
1824 /**
1825 * Check content hash in phash table
1826 *
1827 * @return mixed Returns TRUE if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
1828 */
function checkContentHash() {
	// Without the index_phash table there is nothing to compare with: index the page.
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return TRUE;
	}

	// Look for a record of the same "phash_grouping" page that already holds identical content.
	$where = 'phash_grouping=' . intval($this->hash['phash_grouping']) . ' AND contentHash=' . intval($this->content_md5h);
	$matchingRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);

	// A match is returned as the row (array with "phash"); otherwise TRUE signals "needs indexing".
	return $matchingRow ? $matchingRow : TRUE;
}
1841
1842 /**
1843 * Check content hash for external documents
1844 * Returns TRUE if the document needs to be indexed (that is, there was no result)
1845 *
1846 * @param integer phash value to check (phash_grouping)
1847 * @param integer Content hash to check
1848 * @return boolean Returns TRUE if the document needs to be indexed (that is, there was no result)
1849 */
function checkExternalDocContentHash($hashGr, $content_md5h) {
	// Without the index_phash table the document always needs indexing.
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return TRUE;
	}

	$where = 'phash_grouping=' . intval($hashGr) . ' AND contentHash=' . intval($content_md5h);
	$matchingRows = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);

	// No record with this grouping hash and content hash means: index the document.
	return ($matchingRows == 0);
}
1859
1860 /**
1861 * Checks if a grlist record has been set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
1862 *
1863 * @param integer Phash integer to test.
1864 * @return boolean
1865 */
function is_grlist_set($phash_x) {
	// Without the gr_list table no record can be set.
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return FALSE;
	}

	// The lookup is done on the phash_x column, not on phash.
	$matchingRows = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . intval($phash_x));
	return ($matchingRows > 0);
}
1874
1875 /**
1876 * Check if a grlist entry for this hash exists and, if not, write one.
1877 *
1878 * @param integer phash of the search result that should be found
1879 * @param integer The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
1880 * @return void
1881 * @see submit_grlist()
1882 */
function update_grlist($phash, $phash_x) {
	// Nothing to do when the gr_list table is not in use.
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

	// Is the current group list already registered for this phash?
	$where = 'phash=' . intval($phash) . ' AND hash_gr_list=' . tx_indexedsearch_util::md5inthash($this->conf['gr_list']);
	$existingRows = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existingRows != 0) {
		return;
	}

	// Not yet: write the entry and log it.
	$this->submit_grlist($phash, $phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '" . $this->conf['gr_list'] . "' for phash '" . $phash . "'", 1);
}
1892
1893 /**
1894 * Update tstamp for a phash row.
1895 *
1896 * @param integer phash value
1897 * @param integer If set, update the mtime field to this value.
1898 * @return void
1899 */
function updateTstamp($phash, $mtime = 0) {
	// Nothing to do when the index_phash table is not in use.
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	// tstamp is always set to the current execution time.
	$fieldValues = array('tstamp' => $GLOBALS['EXEC_TIME']);

	// item_mtime is only touched when a non-empty mtime was passed.
	if ($mtime) {
		$fieldValues['item_mtime'] = intval($mtime);
	}

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), $fieldValues);
}
1911
1912 /**
1913 * Update SetID of the index_phash record.
1914 *
1915 * @param integer phash value
1916 * @return void
1917 */
function updateSetId($phash) {
	// Nothing to do when the index_phash table is not in use.
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	// Write the configured freeIndexSetId into the record of the given phash.
	$fieldValues = array();
	$fieldValues['freeIndexSetId'] = intval($this->conf['freeIndexSetId']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), $fieldValues);
}
1926
1927 /**
1928 * Update parsetime for phash row.
1929 *
1930 * @param integer phash value.
1931 * @param integer Parsetime value to set.
1932 * @return void
1933 */
function updateParsetime($phash, $parsetime) {
	// Nothing to do when the index_phash table is not in use.
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	// Store the parse time (cast to integer) in the record of the given phash.
	$fieldValues = array();
	$fieldValues['parsetime'] = intval($parsetime);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), $fieldValues);
}
1942
1943 /**
1944 * Update section rootline for the page
1945 *
1946 * @return void
1947 */
function updateRootline() {
	// Nothing to do when the section table is not in use.
	if (!tx_indexedsearch_util::isTableUsed('index_section')) {
		return;
	}

	// getRootLineFields() fills the array by reference with the root-line columns.
	$rootLineFields = array();
	$this->getRootLineFields($rootLineFields);

	// Apply them to every section row of the current page.
	$where = 'page_id=' . intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
}
1955
1956 /**
1957 * Adding values for root-line fields.
1958 * rl0, rl1 and rl2 are standard. A hook might add more.
1959 *
1960 * @param array Field array, passed by reference
1961 * @return void
1962 */
function getRootLineFields(array &$fieldArray) {
	// Root-line uids of the current page; a missing or malformed configuration
	// is treated as an empty root line so that every field falls back to 0.
	$rootLineUids = array();
	if (isset($this->conf['rootline_uids']) && is_array($this->conf['rootline_uids'])) {
		$rootLineUids = $this->conf['rootline_uids'];
	}

	// Standard fields and the root-line level each of them is taken from.
	$levelByField = array(
		'rl0' => 0,
		'rl1' => 1,
		'rl2' => 2
	);

	// Additional fields may be registered via configuration (field name => root-line level).
	// As before, a registered field with the name of a standard field overrides it.
	if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])
		&& is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])
	) {
		foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
			$levelByField[$fieldName] = $rootLineLevel;
		}
	}

	// Levels that do not exist in the root line (e.g. pages close to the root)
	// yield 0 - the same value intval() produced before - but without raising
	// an "undefined index/offset" notice.
	foreach ($levelByField as $fieldName => $rootLineLevel) {
		$fieldArray[$fieldName] = isset($rootLineUids[$rootLineLevel]) ? intval($rootLineUids[$rootLineLevel]) : 0;
	}
}
1974
1975 /**
1976 * Removes any indexed pages with userlogins which has the same contentHash
1977 * NOT USED anywhere inside this class!
1978 *
1979 * @return void
1980 */
function removeLoginpagesWithContentHash() {
	// Both tables take part in the join below, so both have to be in use.
	if (!tx_indexedsearch_util::isTableUsed('index_phash') || !tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

	// Pages of the same phash_grouping with identical content hash whose
	// gr_list differs from the default (not logged in) group list.
	$where = '
		A.phash=B.phash
		AND A.phash_grouping=' . intval($this->hash['phash_grouping']) . '
		AND B.hash_gr_list<>' . tx_indexedsearch_util::md5inthash($this->defaultGrList) . '
		AND A.contentHash=' . intval($this->content_md5h);
	$loginPages = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', $where);

	if ($loginPages) {
		while (FALSE !== ($loginPage = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($loginPages))) {
			$this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $loginPage['phash'] . '\' are now removed.', 1);
			$this->removeOldIndexedPages($loginPage['phash']);
		}
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($loginPages);
}
1995
1996 /**
1997 * Includes the crawler class
1998 *
1999 * @return void
2000 */
function includeCrawlerClass() {
	// The library file lives in the root of the "crawler" extension.
	$crawlerLibraryFile = t3lib_extMgm::extPath('crawler') . 'class.tx_crawler_lib.php';
	t3lib_div::requireOnce($crawlerLibraryFile);
}
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014 /********************************
2015 *
2016 * SQL; Submitting words
2017 *
2018 *******************************/
2019
2020 /**
2021 * Adds new words to db
2022 *
2023 * @param array $wordListArray Word List array (where each word has information about position etc).
2024 * @return void
2025 */
function checkWordList($wordListArray) {
	// Nothing to do without the word table or without words.
	if (!tx_indexedsearch_util::isTableUsed('index_words') || !count($wordListArray)) {
		return;
	}

	// Collect the word ids (hashes) of all words in the list.
	$wordIds = array();
	foreach ($wordListArray as $wordInfo) {
		$wordIds[] = intval($wordInfo['hash']);
	}
	$wordIdList = implode(',', $wordIds);

	// If every word is already stored, there is nothing to insert.
	$storedCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', 'wid IN (' . $wordIdList . ')');
	if ($storedCount == count($wordListArray)) {
		return;
	}

	// Find out which word ids exist already. The comparison is done on "wid"
	// and not on "baseword": the baseword column is limited to 60 characters, so
	// a longer word read back from the table would not match its key in
	// $wordListArray and would be inserted again, causing a duplicate-key error.
	$storedWordIds = array();
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('wid', 'index_words', 'wid IN (' . $wordIdList . ')');
	$this->log_setTSlogMessage('Inserting words: ' . (count($wordListArray) - $storedCount), 1);
	if ($res) {
		while (($storedRow = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
			$storedWordIds[intval($storedRow['wid'])] = TRUE;
		}
		$GLOBALS['TYPO3_DB']->sql_free_result($res);
	}

	// Insert only the words whose id is not stored yet.
	foreach ($wordListArray as $baseword => $wordInfo) {
		$wordId = intval($wordInfo['hash']);
		if (isset($storedWordIds[$wordId])) {
			continue;
		}

		$insertFields = array(
			'wid' => $wordInfo['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordInfo['metaphone']
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);

		// Remember the id so that another word of this list sharing the same
		// hash does not trigger a duplicate-key error either.
		$storedWordIds[$wordId] = TRUE;
	}
}
2057
2058 /**
2059 * Submits RELATIONS between words and phash
2060 *
2061 * @param array Word list array
2062 * @param integer phash value
2063 * @return void
2064 */
function submitWords($wordList, $phash) {
	// Nothing to do when the relation table is not in use.
	if (!tx_indexedsearch_util::isTableUsed('index_rel')) {
		return;
	}

	// Drop all existing relations of this phash before writing the new ones.
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . intval($phash));

	foreach ($wordList as $wordInfo) {
		// Share of this word among all words, mapped by freqMap().
		$relativeFrequency = $wordInfo['count'] / $this->wordcount;

		$relationFields = array(
			'phash' => $phash,
			'wid' => $wordInfo['hash'],
			'count' => $wordInfo['count'],
			'first' => $wordInfo['first'],
			'freq' => $this->freqMap($relativeFrequency),
			// Only the bits allowed by the flag bit mask are stored.
			'flags' => ($wordInfo['cmp'] & $this->flagBitMask)
		);

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relationFields);
	}
}
2083
2084 /**
2085 * maps frequency from a real number in [0;1] to an integer in [0;$this->freqRange] with anything above $this->freqMax as 1
2086 * and back.
2087 *
2088 * @param double Frequency
2089 * @return integer Frequency in range.
2090 */
function freqMap($freq) {
	// Scale factor between a relative frequency and the stored frequency value.
	$mapFactor = $this->freqMax * 100 * $this->freqRange;

	// Forward mapping covers the whole closed interval [0;1]. A frequency of
	// exactly 1 (every word of the document is this word) belongs here as well;
	// with the former "< 1" test it fell into the reverse mapping below and was
	// turned into a value close to 0 instead of the maximum.
	if ($freq <= 1) {
		$newFreq = $freq * $mapFactor;
		// The mapped value is capped at $this->freqRange.
		$newFreq = $newFreq > $this->freqRange ? $this->freqRange : $newFreq;
	} else {
		// Values above 1 are treated as already mapped and converted back.
		$newFreq = $freq / $mapFactor;
	}

	return $newFreq;
}
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112 /********************************
2113 *
2114 * Hashing
2115 *
2116 *******************************/
2117
2118 /**
2119 * Get search hash, T3 pages
2120 *
2121 * @return void
2122 */
function setT3Hashes() {
	// Identity of a "page": id, type, language, mount point and cHash parameters.
	// The key order matters because the array is serialized before hashing.
	$pageIdentity = array(
		'id' => intval($this->conf['id']),
		'type' => intval($this->conf['type']),
		'sys_lang' => intval($this->conf['sys_language_uid']),
		'MP' => strval($this->conf['MP']),
		'cHash' => $this->cHashParams
	);

	// Grouping hash: the same for all login variants of the page.
	$this->hash['phash_grouping'] = tx_indexedsearch_util::md5inthash(serialize($pageIdentity));

	// Plain phash: additionally takes the group list into account, appended as last key.
	$pageIdentityWithGroups = $pageIdentity;
	$pageIdentityWithGroups['gr_list'] = strval($this->conf['gr_list']);
	$this->hash['phash'] = tx_indexedsearch_util::md5inthash(serialize($pageIdentityWithGroups));
}
2141
2142 /**
2143 * Get search hash, external files
2144 *
2145 * @param string File name / path which identifies it on the server
2146 * @param array Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
2147 * @return array Array with "phash_grouping" and "phash" inside.
2148 */
function setExtHashes($file, $subinfo = array()) {
	// The grouping hash identifies the file as a whole ...
	$fileIdentity = array('file' => $file);

	// ... while the phash also covers the sub-information (e.g. a page group of a PDF).
	$partIdentity = array(
		'file' => $file,
		'subinfo' => $subinfo
	);

	return array(
		'phash_grouping' => tx_indexedsearch_util::md5inthash(serialize($fileIdentity)),
		'phash' => tx_indexedsearch_util::md5inthash(serialize($partIdentity))
	);
}
2165
2166 /*********************************
2167 *
2168 * Internal logging functions
2169 *
2170 *********************************/
2171
2172 /**
2173 * Push function wrapper for TT logging
2174 *
2175 * @param string Title to set
2176 * @param string Key (?)
2177 * @return void
2178 */
function log_push($msg, $key) {
	// Forward to the global TT object only when it really is an object.
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker)) {
		return;
	}
	$timeTracker->push($msg, $key);
}
2184
2185 /**
2186 * Pull function wrapper for TT logging
2187 *
2188 * @return void
2189 */
function log_pull() {
	// Forward to the global TT object only when it really is an object.
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker)) {
		return;
	}
	$timeTracker->pull();
}
2195
2196 /**
2197 * Set log message function wrapper for TT logging
2198 *
2199 * @param string Message to set
2200 * @param integer Error number
2201 * @return void
2202 */
function log_setTSlogMessage($msg, $errorNum=0) {
	// Forward to the global TT object when it is available ...
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->setTSlogMessage($msg, $errorNum);
	}

	// ... and always keep the message in the internal log of the indexer.
	$this->internal_log[] = $msg;
}
2209
2210
2211
2212
2213
2214
2215
2216
2217 /**************************
2218 *
2219 * tslib_fe hooks:
2220 *
2221 **************************/
2222
2223 /**
2224 * Makes sure that keywords are space-separated. This is important for their
2225 * proper displaying as a part of fulltext index.
2226 *
2227 * @param string $keywordList
2228 * @return string
2229 * @see http://bugs.typo3.org/view.php?id=1436
2230 */
protected function addSpacesToKeywordList($keywordList) {
	// Split on commas, then join again with ", " and pad the result with one
	// space on each side.
	$keywordItems = t3lib_div::trimExplode(',', $keywordList);
	$spacedList = implode(', ', $keywordItems);

	return ' ' . $spacedList . ' ';
}
2235 }
2236 ?>