7ebf53f6356e039b7da7846a8551bf98317a2b44
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * Indexing class for TYPO3 frontend
35 *
36 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
37 * @package TYPO3
38 * @subpackage tx_indexedsearch
39 */
40 class tx_indexedsearch_indexer {
41
/**
 * Log messages keyed by the status code returned by checkMtimeTstamp().
 * Codes above zero are logged as the reason for indexing, the others as the reason for skipping it.
 *
 * @var array
 */
var $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => "The configured max-age was exceeded for the document and thus it's indexed.",
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * Comma-separated list of HTML tags whose sections are removed from the body before indexing.
 *
 * @var string
 */
var $excludeSections = 'script,style';

/**
 * External parser objects for external files. Keys are file extension names,
 * values are objects with certain methods.
 *
 * @var array
 */
var $external_parsers = array();

/**
 * Default fe-group list. Pages might be indexed separately for each usergroup
 * combination to support search in access limited pages.
 *
 * @var string
 */
var $defaultGrList = '0,-1';

/**
 * Maximum age (seconds) of an indexed document. If set, the document is re-indexed
 * when this limit is exceeded, regardless of mtime.
 *
 * @var integer
 */
var $tstamp_maxAge = 0;

/**
 * Minimum number of seconds before a document can be indexed again, regardless of mtime.
 *
 * @var integer
 */
var $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var integer
 */
var $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var boolean
 */
var $forceIndexing = FALSE;

/**
 * Set when a crawler session is detected (internal).
 *
 * @var boolean
 */
var $crawlerActive = FALSE;

/**
 * INTERNAL: Template for the array of content parts produced by splitHTMLContent().
 *
 * @var array
 */
var $defaultContentArray = array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => '',
);

/** @var integer */
var $wordcount = 0;

/** @var integer */
var $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning).
 *
 * @var array
 */
var $conf = array();

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'].
 *
 * @var array
 */
var $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping.
 *
 * @var array
 */
var $hash = array();

/**
 * Hash array for files.
 *
 * @var array
 */
var $file_phash_arr = array();

/**
 * Content of the TYPO3 page, split into parts.
 *
 * @var array
 */
var $contentParts = array();

/**
 * Hash calculated over the content parts in indexTypo3PageContent().
 *
 * @var string
 */
var $content_md5h = '';

/**
 * Internal log.
 *
 * @var array
 */
var $internal_log = array();

/**
 * Content most recently fetched by indexExternalUrl().
 *
 * @var string
 */
var $indexExternalUrl_content = '';

/**
 * cHash parameters array, set up in init().
 *
 * @var array
 */
var $cHashParams = array();

/** @var integer */
var $freqRange = 32000;

/** @var float */
var $freqMax = 0.1;

/**
 * Whether metaphone search is enabled; set from the indexer configuration in init().
 *
 * @var boolean
 */
var $enableMetaphoneSearch = FALSE;

/**
 * Set in init(): TRUE when metaphone search is enabled and the "index_words" table is not used.
 *
 * @var boolean
 */
var $storeMetaphoneInfoAsWords;

/** @var string */
var $metaphoneContent = '';

/**
 * Charset class object
 *
 * @var t3lib_cs
 */
var $csObj;

/**
 * Metaphone object, if any
 *
 * @var user_DoubleMetaPhone
 */
var $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var tx_indexedsearch_lexer
 */
var $lexerObj;

/**
 * Set in init() from the "flagBitMask" indexer configuration, forced into the range 0-255.
 *
 * @var integer
 */
var $flagBitMask;
120
/**
 * Frontend hook: decides whether the page rendered by the TSFE object should be indexed
 * and, if so, copies the required values from TSFE into $this->conf and runs the indexing.
 *
 * A page is indexed only when "config.index_enable" is set, frontend indexing is not disabled
 * (or a crawler session forces it), the page has no "no_search" flag, is not rendered with
 * "no_cache" and is not showing fall-back language content.
 *
 * @param object Parent Object (frontend TSFE object), passed by reference
 * @return void
 */
function hook_indexContent(&$pObj) {

	// Settings of indexed_search made in the Extension Manager:
	$extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// The crawler forces re-indexing when it is loaded, a crawler session is running
	// and re-indexing was requested as processing instruction:
	$reindexRequested = t3lib_extMgm::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($reindexRequested) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		// Index despite timestamps etc.
		$this->forceIndexing = TRUE;
	}

	// Nothing to do (and nothing to log) unless indexing is enabled for the page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page', '');

	if ($extensionSettings['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid, $pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Uids of the pages in the rootline, keeping the rootline keys:
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $rootlineKey => $rootlinePage) {
			$rootlineUids[$rootlineKey] = $rootlinePage['uid'];
		}

		$this->conf = array(
			// The page being indexed:
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			// Mount point variable, if any
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,

			// cHash string and the array of additional parameters it covers:
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,

			// Creation date of the page and reg1 of the caching table:
			'crdate' => $pObj->page['crdate'],
			'page_cache_reg1' => $pObj->page_cache_reg1,

			'rootline_uids' => $rootlineUids,

			// The rendered HTML, an alternative title, the charset of the content
			// (converted to utf-8 during indexing) and its most recent modification time:
			'content' => $pObj->content,
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			'metaCharset' => $pObj->metaCharset,
			'mtime' => $pObj->register['SYS_LASTCHANGED'],

			// Behaviour: index external documents, description length, index meta tags (default TRUE):
			'index_externals' => $pObj->config['config']['index_externals'],
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,

			// Not used for frontend indexing:
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0,
		);

		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
203
204
205
206
207
208
209
210
211 /****************************
212 *
213 * Backend API
214 *
215 ****************************/
216
/**
 * Sets up $this->conf for indexing from the backend and initializes the indexer.
 * The values identify the page (phash) being indexed, or the page external media is attached to.
 *
 * @param integer The page uid, &id=
 * @param integer The page type, &type=
 * @param integer sys_language uid, typically &L=
 * @param string The MP variable (Mount Points), &MP=
 * @param array Rootline array of only UIDs.
 * @param array Array of GET variables to register with this indexing
 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
 * @return void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

	// A cHash is only calculated on request; otherwise it stays empty:
	if ($createCHash) {
		$cHash = t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array));
	} else {
		$cHash = '';
	}

	$this->conf = array(
		// The page being indexed:
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		// Group list (hardcoded for now...)
		'gr_list' => '0,-1',

		// cHash string and the array of additional parameters:
		'cHash' => $cHash,
		'cHash_array' => $cHash_array,

		// Defaults:
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',

		'rootline_uids' => $uidRL,

		// Behaviour: index external documents, description length 200, index meta tags:
		'index_externals' => 1,
		'index_descrLgd' => 200,
		'index_metatags' => TRUE,
	);

	$this->init();
}
261
/**
 * Stores the free-index uid and its set id in $this->conf.
 * Can be called right after backend_initIndexer().
 *
 * @param integer Free index UID
 * @param integer Set id - an integer identifying the "set" of indexing operations.
 * @return void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	$this->conf['freeIndexUid'] = $freeIndexUid;
	$this->conf['freeIndexSetId'] = $freeIndexSetId;
}
273
/**
 * Indexes record content as if it was the content of a TYPO3 page.
 * The values are escaped with htmlspecialchars() and wrapped in a minimal HTML document
 * which is then passed through the regular page indexing.
 *
 * @param string Title equivalent
 * @param string Keywords equivalent
 * @param string Description equivalent
 * @param string The main content to index
 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param integer Last modification time, in seconds
 * @param integer The creation date of the content, in seconds
 * @param integer The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

	// Times and origin of the content:
	$this->conf['mtime'] = $mtime;
	$this->conf['crdate'] = $crdate;
	$this->conf['recordUid'] = $recordUid;

	// Escape the values for the fake HTML document:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedContent = htmlspecialchars($content);

	// The fake HTML document that gets parsed like a rendered page:
	$this->conf['content'] = '
		<html>
			<head>
				<title>' . $escapedTitle . '</title>
				<meta name="keywords" content="' . $escapedKeywords . '" />
				<meta name="description" content="' . $escapedDescription . '" />
			</head>
			<body>
				' . $escapedContent . '
			</body>
		</html>';

	// Charset of the content (converted to utf-8 during indexing); no alternative title:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

	$this->indexTypo3PageContent();
}
314
315
316
317
318
319
320
321
322
323
324
325
326
327 /********************************
328 *
329 * Initialization
330 *
331 *******************************/
332
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Prepares the cHash parameters and page hashes, reads the indexer configuration and
 * creates the external parsers, the lexer, the optional metaphone object and the charset object.
 *
 * @return void
 */
function init() {

	// cHash parameters:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		// Add the cHash so that URL's come out right...
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		// encryptionKey is added inside TSFE in order to calculate the cHash value and it should
		// NOT be a part of this array!!! If it is it will be exposed in links!!!
		unset($this->cHashParams['encryptionKey']);
	}

	// phash / phash_grouping identifying the indexed page:
	$this->setT3Hashes();

	// Indexer configuration from the Extension Manager (ages are configured in hours):
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['minAge'] * 3600, 0);
	$this->tstamp_maxAge = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['maxAge'] * 3600, 0);
	$this->maxExternalFiles = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
	$this->flagBitMask = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);

	// Workaround: If the extension configuration was not updated yet, the value is not existing.
	// In that case metaphone search counts as enabled.
	if (isset($this->indexerConfig['enableMetaphoneSearch'])) {
		$this->enableMetaphoneSearch = (bool) $this->indexerConfig['enableMetaphoneSearch'];
	} else {
		$this->enableMetaphoneSearch = TRUE;
	}

	$this->storeMetaphoneInfoAsWords = !tx_indexedsearch_util::isTableUsed('index_words') && $this->enableMetaphoneSearch;

	// External document parsers. Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

	// Lexer (class that deconstructs the text into words).
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'];
	if (!$lexerObjRef) {
		$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	}
	$this->lexerObj = t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

	// Metaphone hook.
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	// Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
	if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}

	// Charset class:
	$this->csObj = t3lib_div::makeInstance('t3lib_cs');
}
388
/**
 * Initialize external parsers
 *
 * Creates one parser object per file extension configured in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] and keeps it in
 * $this->external_parsers if its initParser() method returns TRUE.
 * A reference that does not resolve to an object is skipped and logged instead of
 * ending in a fatal error when initParser() is called on it.
 *
 * @return void
 * @access private
 * @see init()
 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	if (!is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}

	foreach ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$parser = t3lib_div::getUserObj($_objRef);

		// The reference could not be resolved (e.g. missing file or class): no parser for this extension.
		if (!is_object($parser)) {
			unset($this->external_parsers[$extension]);
			// NOTE(review): severity 2 is assumed to be the warning level of the TS log - confirm against log_setTSlogMessage()
			$this->log_setTSlogMessage('External parser for "' . $extension . '" could not be instantiated from "' . $_objRef . '"', 2);
			continue;
		}

		$parser->pObj = $this;
		$this->external_parsers[$extension] = $parser;

		// Init parser and if it returns FALSE, unset its entry again:
		if (!$parser->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426 /********************************
427 *
428 * Indexing; TYPO3 pages (HTML content)
429 *
430 *******************************/
431
/**
 * Start indexing of the TYPO3 page
 *
 * Indexing takes place when the mtime/tstamp check returns a positive code, when the
 * gr_list is not yet registered for the phash, or when indexing is forced. If the content
 * hash is already known (and the check result is not 1), only timestamp, set id, gr_list
 * and rootline are updated.
 *
 * @return void
 */
function indexTypo3PageContent() {

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	// Nothing to do?
	if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

	// Log why indexing happens:
	if ($this->forceIndexing) {
		$reason = 'Forced';
	} elseif ($check > 0) {
		$reason = $this->reasons[$check];
	} else {
		$reason = 'Updates gr_list!';
	}
	$this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

	// Divide into title, keywords, description and body; an alternative title overrides the parsed one:
	$this->log_push('Split content', '');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

	// Hash over all content parts. Title, description and keywords are included on purpose,
	// otherwise a changed page title would make no difference.
	$this->content_md5h = tx_indexedsearch_util::md5inthash(implode('', $this->contentParts));

	// If a page with the very same content hash is already indexed, the content does not need to be
	// indexed again regardless of mtime - unless the max-age was exceeded ($check === 1).
	// This also prevents re-indexing for logged in fe_users when the page content is the same anyway.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check !== 1) {
		$this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
		$this->updateSetId($this->hash['phash']);
		// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->update_grlist($checkCHash['phash'], $this->hash['phash']);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

	// Full indexing; the parse time is measured from here:
	$Pstart = t3lib_div::milliseconds();

	$this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
	$this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
	$this->log_pull();

	$this->log_push('Extract words from content', '');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

	$this->log_push('Analyse the extracted words', '');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

	// The page (phash) record:
	$this->log_push('Submitting page', '');
	$this->submitPage();
	$this->log_pull();

	// The words, if the word table is in use:
	$this->log_push('Check word list and submit words', '');
	if (tx_indexedsearch_util::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $this->hash['phash']);
	}
	$this->log_pull();

	$this->updateParsetime($this->hash['phash'], t3lib_div::milliseconds() - $Pstart);

	// External files linked from the page, if configured for:
	$this->log_push('Checking external files', '');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
520
/**
 * Splits HTML content into title, keywords, description and body text.
 *
 * Everything from the first "<body" on is the body, everything before it the head.
 * If the title contains a ":", only the part after the first colon is kept.
 * Note: if the content contains no "<body" at all, head and body both end up empty.
 *
 * @param string HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
function splitHTMLContent($content) {

	// Separate head and body:
	$parts = $this->defaultContentArray;
	$parts['body'] = stristr($content, '<body');
	$head = substr($content, 0, -strlen($parts['body']));

	// Title: the last of at most two colon-separated segments
	$this->embracingTags($head, 'TITLE', $parts['title'], $dummy2, $dummy);
	$titleSegments = explode(':', $parts['title'], 2);
	$parts['title'] = trim(end($titleSegments));

	// Keywords and description from the meta tags:
	if ($this->conf['index_metatags']) {
		// Consume the head one meta tag at a time and collect the attribute strings...
		$metaAttributeStrings = array();
		while ($this->embracingTags($head, 'meta', $dummy, $head, $attributeString)) {
			$metaAttributeStrings[] = $attributeString;
		}
		// ...then evaluate them:
		foreach ($metaAttributeStrings as $attributeString) {
			$attributes = t3lib_div::get_tag_attributes($attributeString);
			if (stristr($attributes['name'], 'keywords')) {
				$parts['keywords'] .= ',' . $this->addSpacesToKeywordList($attributes['content']);
			}
			if (stristr($attributes['name'], 'description')) {
				$parts['description'] .= ',' . $attributes['content'];
			}
		}
	}

	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($parts['body']);

	// Remove every occurrence of the excluded sections (ie. scripting and style stuff) from the body:
	foreach (explode(',', $this->excludeSections) as $excludedTag) {
		while ($this->embracingTags($parts['body'], $excludedTag, $dummy, $parts['body'], $dummy2)) {
			// nothing to do, the call itself shortens the body
		}
	}

	// Strip the tags; a space before each tag keeps adjacent words from being concatenated:
	$parts['body'] = trim(strip_tags(str_replace('<', ' <', $parts['body'])));

	$parts['keywords'] = trim($parts['keywords']);
	$parts['description'] = trim($parts['description']);

	return $parts;
}
578
/**
 * Extract the charset value from HTML meta tag.
 *
 * Looks for <meta http-equiv="Content-Type" content="...; charset=..."> first and, if that
 * yields nothing, for the HTML5 short form <meta charset="...">.
 * Charset names may contain letters, digits, "-" and "_" (eg. "Shift_JIS").
 *
 * @param string HTML content
 * @return string The charset value if found, otherwise NULL.
 */
function getHTMLcharset($content) {
	// <meta http-equiv="Content-Type" content="text/html; charset=...">
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)
		&& preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]_-]+)/i', $reg[0], $reg2)) {
		return $reg2[1];
	}

	// <meta charset="..."> (charset as first attribute, quotes optional)
	if (preg_match('/<meta[[:space:]]+charset[[:space:]]*=[[:space:]]*["\']?([[:alnum:]_-]+)/i', $content, $reg)) {
		return $reg[1];
	}

	return NULL;
}
592
/**
 * Converts a HTML document to utf-8
 *
 * The charset is taken from the argument or, if that is empty, from the meta tag of the
 * document. Entities are converted to utf-8 afterwards.
 *
 * @param string HTML content, any charset
 * @param string Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
function convertHTMLToUtf8($content, $charset = '') {

	// Determine and normalize the charset:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

	// Only content in a known charset other than utf-8 needs conversion:
	if ($charset && $charset !== 'utf-8') {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

	// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
615
/**
 * Finds the first occurrence of a tag (case-insensitive) and hands back, through the
 * referenced arguments, its attributes, the embraced content and the remaining string.
 * Useful for finding the <title> of a document or for removing <script>-sections.
 *
 * If the end tag is found, "content after" is the input string with the whole element cut out.
 * If there is no end tag, the embraced content is blank and "content after" is only what
 * follows the start tag. If the tag is not found at all, the referenced arguments are left untouched.
 *
 * @param string String to search in
 * @param string Tag name, eg. "script"
 * @param string Passed by reference: Content inside found tag
 * @param string Passed by reference: Content after found tag
 * @param string Passed by reference: Attributes of the found tag.
 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
 */
function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
	$openingTag = '<' . $tagName;
	$closingTag = '</' . $tagName . '>';

	// Case-insensitive search for the start of the tag:
	$openingPos = stripos($string, $openingTag);
	if ($openingPos === FALSE) {
		return FALSE;
	}

	// Split what follows the tag name at the first ">": attributes | rest of the string
	list($paramList, $remainder) = explode('>', substr($string, $openingPos + strlen($openingTag)), 2);

	$closingPos = stripos($remainder, $closingTag);
	if ($closingPos !== FALSE) {
		$tagContent = substr($remainder, 0, $closingPos);
		$stringAfter = substr($string, 0, $openingPos) . substr($remainder, $closingPos + strlen($closingTag));
	} else {
		// No end tag: blank content, and only what follows the start tag is handed back.
		$tagContent = '';
		$stringAfter = $remainder;
	}

	return TRUE;
}
651
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. What follows a "begin" marker is kept.
 * At an "end" marker the segment remembered from before the markers is kept, unless a
 * "begin" marker has been seen in between.
 *
 * @param string HTML Content, passed by reference
 * @return boolean Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
function typoSearchTags(&$body) {
	$segments = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/', $body);

	// No marker present: leave the body as it is.
	if (count($segments) <= 1) {
		return FALSE;
	}

	$body = '';
	foreach ($segments as $segment) {
		// Marker name | content following the marker
		$pieces = explode('-->', $segment, 2);
		switch (trim($pieces[0])) {
			case 'begin':
				$body .= $pieces[1];
				$prev = '';
				break;
			case 'end':
				$body .= $prev;
				break;
			default:
				$prev = $segment;
		}
	}

	return TRUE;
}
680
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme are indexed as external URLs (if "indexExternalURLs" is configured).
 * Links without a query string that resolve to an existing local file are either indexed directly
 * or, if the crawler is configured for external files, added to the crawler queue.
 *
 * @param string HTML content
 * @return void
 */
function extractLinks($content) {

	// Get links:
	$list = $this->extractHyperLinks($content);

	// The crawler object stays NULL unless files are to be queued for the crawler:
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

	// Traverse links:
	foreach ($list as $linkInfo) {

		// localPath means: This file is sent by a download script. While the indexed URL has to
		// point to $linkInfo['href'], the absolute path to the file is specified in localPath!
		$hasLocalPath = !empty($linkInfo['localPath']);

		// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

		// Parse URL:
		$qParts = parse_url($linkSource);

		// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
			parse_str($qParts['query'], $getP);
			$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
			// Parse again due to new linkSource!
			$qParts = parse_url($linkSource);
		}

		if (!$hasLocalPath && !empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

		// Links with a query string are not treated as files:
		if (!empty($qParts['query'])) {
			continue;
		}

		$linkSource = urldecode($linkSource);
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		if (is_object($crawler)) {
			// Queue the file for the crawler; the page content is not passed along.
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0, $params, 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files', $this->conf['id']);
			$this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
		} elseif ($hasLocalPath) {
			// Index the local file under the URL of the link:
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
766
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Anchors without a href and anchors whose href starts with "#" are left out.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
function extractHyperLinks($html) {
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('a', $html);
	$hyperLinksData = array();

	foreach ($htmlParts as $index => $tagData) {
		// Only the odd-indexed parts of the split result are evaluated as tags.
		if (($index % 2) === 0) {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

		$href = isset($tagAttributes[0]['href']) ? $tagAttributes[0]['href'] : '';
		// Square brackets instead of the curly-brace offset syntax, which PHP 8 no longer parses.
		if ($href && $href[0] !== '#') {
			$hyperLinksData[] = array(
				'tag' => $tagData,
				'href' => $href,
				'localPath' => $this->createLocalPath($href)
			);
		}
	}

	return $hyperLinksData;
}
797
/**
 * Extracts the "base href" from content string.
 * The first <base> tag that carries a non-empty href wins.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html) {
	$href = '';
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('base', $html);
	foreach ($htmlParts as $index => $tagData) {
			// Only odd-indexed parts are inspected as tags
			// (presumably splitTags() alternates content and tags - confirm)
		if (($index % 2) === 0) {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);

			// A <base> tag without href (e.g. <base target="...">) is ignored instead of
			// overwriting the result with NULL; the method keeps returning a string.
		if (strtolower($firstTagName) == 'base' && isset($tagAttributes[0]['href'])) {
			$href = $tagAttributes[0]['href'];
			if ($href) {
				break;
			}
		}
	}

	return $href;
}
823
824 /******************************************
825 *
826 * Indexing; external URL
827 *
828 ******************************************/
829
/**
 * Index External URLs HTML content
 *
 * Fetches the headers of the URL first and only continues when the content
 * type contains "text/html". The content is then downloaded, written to a
 * temporary file and handed to indexRegularDocument() with forced indexing.
 * The downloaded content is also kept in $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 */
function indexExternalUrl($externalUrl) {

		// Get headers; anything but an array means there was no usable answer
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
		return;
	}

		// HTTP header field names are case-insensitive, so fall back to a
		// case-insensitive search when the canonical spelling is not present.
	$contentType = '';
	if (isset($urlHeaders['Content-Type'])) {
		$contentType = $urlHeaders['Content-Type'];
	} else {
		foreach ($urlHeaders as $headerName => $headerValue) {
			if (strtolower(trim($headerName)) === 'content-type') {
				$contentType = (string) $headerValue;
				break;
			}
		}
	}
	if (!stristr($contentType, 'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

		// Create temporary file:
	$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
	if (!$tmpFile) {
		return;
	}
	t3lib_div::writeFile($tmpFile, $content);

		// Index that file. Using "TRUE" for second parameter to force indexing
		// of external URLs (mtime doesn't make sense, does it?)
	$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
	unlink($tmpFile);
}
862
/**
 * Getting HTTP request headers of URL
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP header names are keys and the raw text after the first colon is the value (a line without a colon, e.g. a status line, maps to NULL)
 */
function getUrlHeaders($url) {
		// Try to get the headers only
	$content = t3lib_div::getUrl($url, 2);

	if (!strlen($content)) {
			// Explicit FALSE, as documented (previously an implicit NULL)
		return FALSE;
	}

		// Compile headers:
	$headers = t3lib_div::trimExplode(LF, $content, 1);
	$retVal = array();
	foreach ($headers as $line) {
		if (!strlen(trim($line))) {
			break; // Stop at the first empty line (= end of header)
		}

			// Lines without a colon yield one part only; reading the second
			// part unconditionally would raise an undefined-offset notice.
		$headerParts = explode(':', $line, 2);
		$retVal[$headerParts[0]] = isset($headerParts[1]) ? $headerParts[1] : NULL;
	}

	return $retVal;
}
889
890
891
/**
 * Resolves a link target to a local file path, if possible.
 * The resolver methods are tried in a fixed order and the first non-empty
 * result is returned.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
	static $resolverMethods = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);

	$resolvedPath = '';
	foreach ($resolverMethods as $resolverMethod) {
		$resolvedPath = $this->$resolverMethod($sourcePath);
		if ($resolvedPath != '') {
				// First resolver that delivers a path wins
			return $resolvedPath;
		}
	}

	return $resolvedPath;
}
915
/**
 * Looks the source path up in the file map registered in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] (keyed by
 * the short MD5 of the source path). This is useful for various download
 * extensions that hide the actual file name but still want the file to be
 * indexed.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Registered file path if it is an existing file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}

	$lookupKey = t3lib_div::shortMD5($sourcePath);

		// Note: self::isAllowedLocalFile is deliberately not used here because
		// this method is allowed to index files outside of the web site (for
		// example, protected downloads)
	$isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);

	return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
938
/**
 * Tries to map the source path to a local file by stripping the site URL
 * (TYPO3_SITE_URL of the current request) from its beginning.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Path below PATH_site if it is an allowed local file, else empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	$siteUrl = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);

		// Not a URL below the current site URL
	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
958
/**
 * Tries to map the source path to a local file by stripping the configured
 * absRefPrefix from its beginning. This requires TSFE; without a tslib_fe
 * instance in $GLOBALS['TSFE'] an empty string is returned.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Path below PATH_site if it is an allowed local file, else empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
		return '';
	}

	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);

		// No prefix configured, or the path does not start with it
	if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $prefixLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
981
/**
 * Attempts to create a local file path from the absolute URL without
 * schema, i.e. a path starting with "/". The leading slash is removed and
 * the remainder is looked up below PATH_site.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Path below PATH_site if it is an allowed local file, else empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	$localPath = '';

		// substr() replaces the "{0}" string offset syntax (a parse error on
		// PHP 8) and does not raise a notice for an empty string.
	if (substr($sourcePath, 0, 1) === '/') {
		$localPath = PATH_site . substr($sourcePath, 1);
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}

	return $localPath;
}
1000
/**
 * Tries to map a relative URL to a local file by looking it up directly
 * below PATH_site.
 *
 * @param string $sourcePath Link target as found in the content
 * @return string Path below PATH_site if it is an allowed local file, else empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}

	$candidate = PATH_site . $sourcePath;

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1017
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not
 * start with "/". URLs that parse_url() cannot handle, or that have no
 * path component, are reported as relative.
 *
 * @param string $url URL to check
 * @return boolean TRUE if the URL is relative
 */
static protected function isRelativeURL($url) {
		// Suppression kept from the original code; parse_url() yields FALSE
		// for URLs it cannot parse.
	$urlParts = @parse_url($url);

		// Missing components are treated as empty strings instead of being read
		// as undefined indexes.
	$scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
	$path = isset($urlParts['path']) ? $urlParts['path'] : '';

		// substr() replaces the "{0}" string offset syntax (a parse error on PHP 8)
	return ($scheme == '' && substr($path, 0, 1) !== '/');
}
1028
/**
 * Checks if the path points to an existing file inside the web site, i.e.
 * the path (after t3lib_div::resolveBackPath()) starts with PATH_site and
 * is a regular file.
 *
 * @param string $filePath Path to check
 * @return boolean TRUE if the file exists and lies below PATH_site
 */
static protected function isAllowedLocalFile($filePath) {
	$resolvedPath = t3lib_div::resolveBackPath($filePath);
	$sitePathLength = strlen(PATH_site);

	$startsWithSitePath = (substr($resolvedPath, 0, $sitePathLength) == PATH_site);
	$existsAsFile = is_file($resolvedPath);

	return $startsWithSitePath && $existsAsFile;
}
1041
1042 /******************************************
1043 *
1044 * Indexing; external files (PDF, DOC, etc)
1045 *
1046 ******************************************/
1047
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The file is divided into the parts reported by fileContentParts(). Each part
 * gets its own phash and is (re-)indexed when checkMtimeTstamp() asks for it or
 * when $force is set. A section record is written for every part, also when
 * the part itself was not indexed.
 *
 * @param string $file Filename relative to PATH_site. It can also be an absolute path as long as it is accepted by t3lib_div::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param boolean $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension to use instead of the one found in $file (e.g. for the temporary file).
 * @return void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

		// Init. A file name without extension yields an empty extension
		// instead of reading an undefined index.
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

		// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {
				// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site . $file);
		} else {
				// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

		// Nothing to index without a readable file and a parser for its extension
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

		// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext, $absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
		$Pstart = t3lib_div::milliseconds();

			// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$subinfo = array('key' => $cPKey);
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file, $subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag', 1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

					// Divide into title,keywords,description and body:
				$this->log_push('Split content', '');
				$contentParts = $this->readFileContent($ext, $absFile, $cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
						// Separator first: the legacy implode($array, $separator) order throws a TypeError on PHP 8.
					$content_md5h = tx_indexedsearch_util::md5inthash(implode('', $contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content', '');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words', '');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page', '');
						$size = filesize($absFile);
							// The creation time of a file cannot be determined here, so the modification time is used instead
						$ctime = filemtime($absFile);
						$this->submitFilePage($phash_arr, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words', '');
						if (tx_indexedsearch_util::isTableUsed('index_words')) {
							$this->checkWordList($indexArr);
							$this->submitWords($indexArr, $phash_arr['phash']);
						}
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'], t3lib_div::milliseconds() - $Pstart);
					} else {
							// Update the timestamp
						$this->updateTstamp($phash_arr['phash'], $mtime);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// Checking and setting sections:
			// Setting a section-record for the file. This is done also if the file is not indexed.
			// Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
1156
/**
 * Reads the content of an external file being indexed by delegating to the
 * external parser object registered for the file extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array Standard content array (title, description, keywords, body keys), or NULL if no parser object is registered for the extension
 */
function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
	$parser = $this->external_parsers[$fileExtension];

		// Without a parser object there is nothing to read
	if (!is_object($parser)) {
		return NULL;
	}

	return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1176
/**
 * Asks the external parser registered for the extension into which parts
 * the document should be divided.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; array(0) if no parser object is registered for the extension
 */
function fileContentParts($ext, $absFile) {
	$parser = $this->external_parsers[$ext];

		// Default: a single part with the pointer 0
	if (!is_object($parser)) {
		return array(0);
	}

	return $parser->fileContentParts($ext, $absFile);
}
1194
/**
 * Wraps non-HTML content (from external files for instance) into a content
 * array: a copy of $this->defaultContentArray with the "body" key set to
 * the given content.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Content array based on $this->defaultContentArray with "body" set
 * @see splitHTMLContent()
 */
function splitRegularContent($content) {
		// Start from the default content array and only fill in the body
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;

	return $parts;
}
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222 /**********************************
1223 *
1224 * Analysing content, Extracting words
1225 *
1226 **********************************/
1227
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * charset already is "utf-8") and decodes numeric / HTML entities to real
 * characters. The array is modified in place.
 *
 * @param array $contentArr Standard content array, passed by reference
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 */
function charsetEntity2utf8(&$contentArr, $charset) {
	$needsCharsetConversion = ($charset !== 'utf-8');

	foreach ($contentArr as $partName => $partContent) {
			// Empty parts are left untouched
		if (!strlen($partContent)) {
			continue;
		}

			// Convert charset if necessary
		$utf8Content = $needsCharsetConversion
			? $this->csObj->utf8_encode($partContent, $charset)
			: $partContent;

			// decode all numeric / html-entities in the string to real characters:
		$contentArr[$partName] = $this->csObj->entities_to_utf8($utf8Content, TRUE);
	}
}
1250
/**
 * Splits every part of the content array from the split*Content functions
 * into words using the lexer object.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content array where each key holds an array of words; title, keywords and description are free of duplicates
 */
function processWordsInArrays($contentArr) {

		// Turn the text of every part into a list of words
	foreach ($contentArr as $partName => $partText) {
		$contentArr[$partName] = $this->lexerObj->split2Words($partText);
	}

		// For title, keywords, and description we don't want duplicates:
	foreach (array('title', 'keywords', 'description') as $headerPart) {
		$contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
	}

	return $contentArr;
}
1272
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length is taken from $this->conf['index_descrLgd'] via
 * t3lib_utility_Math::forceIntegerInRange() (called with 0, 255 and 200 -
 * presumably minimum, maximum and default; confirm). A resulting length of
 * 0 yields an empty description.
 *
 * @param array $contentArr Content array (the "body" key is used)
 * @return string Description string
 */
function bodyDescription($contentArr) {

		// Always return a string, also when the configured length is 0
		// (the variable used to be undefined in that case).
	$bodyDescription = '';

		// Setting description
	$maxL = t3lib_utility_Math::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
	if ($maxL) {
			// NOTE(review): the first search string appears as a plain space in
			// this copy of the file; confirm against the original source.
		$bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);

			// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1292
/**
 * Analyzes content to use for indexing.
 * Title, keywords and description are registered with the bit offsets 7, 6
 * and 5 (see analyzeHeaderinfo()), then the body words are added (see
 * analyzeBody()).
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array keyed by word, filled by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content) {
	$indexArr = array();

	$this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
	$this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
	$this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
	$this->analyzeBody($indexArr, $content);

	return $indexArr;
}
1310
/**
 * Calculates relevant information for headercontent
 *
 * Every word of $content[$key] is registered in the index array with its
 * hash and metaphone value; the bit 2^$offset is OR-ed into the "cmp" value
 * of the word and its "count" is increased.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param integer $offset Bit-wise priority to type (bit position)
 * @return void
 */
function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
		// Nothing to analyze if the part is missing or not a word list
	if (!isset($content[$key]) || !is_array($content[$key])) {
		return;
	}

		// Priority used for flagBitMask feature (see extension configuration).
		// The bit is the same for every word, so it is calculated once.
	$priorityBit = pow(2, $offset);

	foreach ($content[$key] as $val) {
			// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);

		if (!isset($retArr[$val])) {
				// Word ID (wid)
			$retArr[$val]['hash'] = tx_indexedsearch_util::md5inthash($val);

				// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch
				? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60)
				: '';
			$retArr[$val]['metaphone'] = $metaphone;
		}

			// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}

			// "cmp" and "count" do not exist yet when a word is seen for the
			// first time; start from 0 instead of reading an undefined index.
		$currentCmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$retArr[$val]['cmp'] = $currentCmp | $priorityBit;

			// Increase number of occurrences
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1348
/**
 * Calculates relevant information for bodycontent
 *
 * Every word of $content['body'] is registered in the index array with the
 * position of its first occurrence, its hash and metaphone value; the
 * "count" of the word is increased for every occurrence.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 */
function analyzeBody(&$retArr, $content) {
		// Nothing to analyze if the body is missing or not a word list
	if (!isset($content['body']) || !is_array($content['body'])) {
		return;
	}

	foreach ($content['body'] as $key => $val) {
			// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);

		if (!isset($retArr[$val])) {
				// First occurrence (used for ranking results)
			$retArr[$val]['first'] = $key;

				// Word ID (wid)
			$retArr[$val]['hash'] = tx_indexedsearch_util::md5inthash($val);

				// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch
				? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60)
				: '';
			$retArr[$val]['metaphone'] = $metaphone;
		}

			// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}

			// Increase number of occurrences. "count" does not exist yet for a
			// new word; start from 0 instead of reading an undefined index.
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount + 1;
		$this->wordcount++;
	}
}
1384
/**
 * Creates the metaphone value of a word, either as integer hash or raw.
 * The metaphone object is used when one is set; otherwise PHP's native
 * metaphone() function is called.
 *
 * @param string $word Word to convert
 * @param boolean $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 */
function metaphone($word, $returnRawMetaphoneValue=FALSE) {

		// Native PHP function as fallback for the advanced doubleMetaphone class
	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

	if ($returnRawMetaphoneValue) {
		return $rawValue;
	}

	if (strlen($rawValue)) {
			// Create hash and return integer
		return tx_indexedsearch_util::md5inthash($rawValue);
	}

	return 0;
}
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428 /********************************
1429 *
1430 * SQL; TYPO3 Pages
1431 *
1432 *******************************/
1433
/**
 * Writes the index records of the current page (TYPO3 page, not external
 * media) to the database: index_phash, index_section, index_grlist,
 * index_fulltext and - in debug mode - index_debug. Existing records for
 * the phash are removed first.
 *
 * @return void
 */
function submitPage() {

		// Flush everything stored for this phash so far:
	$this->removeOldIndexedPages($this->hash['phash']);

		// index_phash: the record describing the indexed page
	$phashRecord = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
			// 0 = TYPO3 page
		'item_type' => 0,
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
			// Creation date of page
		'item_crdate' => $this->conf['crdate'],
			// Sys language uid of the page. Should reflect which language it DOES actually display!
		'sys_language_uid' => $this->conf['sys_language_uid'],
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	if (tx_indexedsearch_util::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRecord);
	}

		// index_section
	$this->submit_section($this->hash['phash'], $this->hash['phash']);

		// index_grlist
	$this->submit_grlist($this->hash['phash'], $this->hash['phash']);

		// index_fulltext: all content parts joined, optionally cut to the configured length
	$fullText = implode(' ', $this->contentParts);
	$maxFullTextLength = $this->indexerConfig['fullTextDataLength'];
	if ($maxFullTextLength > 0) {
		$fullText = substr($fullText, 0, $maxFullTextLength);
	}
	$fulltextRecord = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => $fullText,
		'metaphonedata' => $this->metaphoneContent
	);
	if (tx_indexedsearch_util::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRecord);
	}

		// index_debug is only written in debug mode
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

		// Page content and body are limited to their first 1000 bytes
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
		'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRecord = array(
		'phash' => $this->hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	if (tx_indexedsearch_util::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRecord);
	}
}
1511
/**
 * Inserts a gr_list record (the gr_list of the current configuration and
 * its hash) into index_grlist, if that table is used.
 *
 * @param integer $hash Search result record phash
 * @param integer $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 */
function submit_grlist($hash, $phash_x) {
	$groupList = $this->conf['gr_list'];

		// The gr_list record
	$record = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => tx_indexedsearch_util::md5inthash($groupList),
		'gr_list' => $groupList
	);

	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $record);
}
1533
/**
 * Inserts a section record into index_section, if that table is used.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is
 * external files.
 *
 * @param integer $hash phash of TYPO3 parent search result record
 * @param integer $hash_t3 phash of the file indexation search record
 * @return void
 */
function submit_section($hash, $hash_t3) {
	$sectionRecord = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

		// The record is handed to getRootLineFields() before it is stored
	$this->getRootLineFields($sectionRecord);

	if (!tx_indexedsearch_util::isTableUsed('index_section')) {
		return;
	}

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRecord);
}
1555
/**
 * Deletes all index records stored for the page with the given phash.
 *
 * @param integer $phash phash value to flush
 * @return void
 */
function removeOldIndexedPages($phash) {
	$phashValue = intval($phash);

		// Removing old registrations for all tables. Because the pages are TYPO3 pages
		// there can be nothing else than 1-1 relations here.
	$pageTables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($pageTables as $pageTable) {
		if (!tx_indexedsearch_util::isTableUsed($pageTable)) {
			continue;
		}
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($pageTable, 'phash=' . $phashValue);
	}

		// Removing all index_section records with phash_t3 set to this hash (this includes
		// such records set for external media on the page as well!). The re-insert of these
		// records is done in indexRegularDocument($file).
	if (tx_indexedsearch_util::isTableUsed('index_section')) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $phashValue);
	}
}
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588 /********************************
1589 *
1590 * SQL; External media
1591 *
1592 *******************************/
1593
1594
/**
 * Updates db with information about the file: index_phash, index_fulltext
 * and - in debug mode - index_debug. Existing records for the phash are
 * removed first.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param integer $mtime Modification time of file.
 * @param integer $ctime Creation time of file.
 * @param integer $size Size of file in bytes
 * @param integer $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 */
function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {

		// Find item Type: the parser may map the extension to another item
		// type, otherwise the extension itself is stored.
	$parser = isset($this->external_parsers[$ext]) ? $this->external_parsers[$ext] : NULL;
	$storeItemType = (is_object($parser) && !empty($parser->ext2itemtype_map[$ext]))
		? $parser->ext2itemtype_map[$ext]
		: $ext;

		// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

		// Split filename: a scheme marks the document as external URL. Local
		// file names have no "scheme" component and parse_url() may return
		// FALSE, so the key is tested instead of read blindly.
	$fileParts = parse_url($file);
	$isExternalUrl = is_array($fileParts) && !empty($fileParts['scheme']);

		// Fall back to the file name when the parser delivered no title
	$title = isset($contentParts['title']) ? $contentParts['title'] : '';

		// Setting new
	$fields = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => trim($title) ? $title : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $isExternalUrl ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	if (tx_indexedsearch_util::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
	}

		// PROCESSING index_fulltext
	$fields = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	if ($this->indexerConfig['fullTextDataLength'] > 0) {
		$fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
	}
	if (tx_indexedsearch_util::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
	}

		// PROCESSING index_debug
	if ($this->indexerConfig['debugMode']) {
		$fields = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize(array(
				'cHashParams' => $subinfo,
					// The body is limited to its first 1000 bytes
				'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
				'logs' => $this->internal_log,
				'lexer' => $this->lexerObj->debugString,
			))
		);
		if (tx_indexedsearch_util::isTableUsed('index_debug')) {
			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
		}
	}
}
1675
/**
 * Stores a gr_list record for a file IF none exists already for either the
 * default gr_list ($this->defaultGrList) or the gr_list of the current
 * configuration.
 *
 * @param integer $hash phash value of file
 * @return void
 */
function submitFile_grlist($hash) {
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

		// If there already is a record for the default gr_list or for the
		// current gr_list, there is no need to place another one.
	$where = 'phash=' . intval($hash) . ' AND (hash_gr_list=' . tx_indexedsearch_util::md5inthash($this->defaultGrList) . ' OR hash_gr_list=' . tx_indexedsearch_util::md5inthash($this->conf['gr_list']) . ')';
	$count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);

	if ($count == 0) {
		$this->submit_grlist($hash, $hash);
	}
}
1691
/**
 * Stores a section record for a file IF none exists yet for the file phash
 * and the current page id.
 *
 * @param integer $hash phash value of file
 * @return void
 */
function submitFile_section($hash) {
	if (!tx_indexedsearch_util::isTableUsed('index_section')) {
		return;
	}

		// Testing if there is already a section
	$where = 'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id']);
	$count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);

	if ($count == 0) {
		$this->submit_section($hash, $this->hash['phash']);
	}
}
1707
/**
 * Deletes the index records stored for the file with the given phash from
 * index_phash, index_grlist, index_fulltext and index_debug.
 *
 * @param integer $phash phash value to flush
 * @return void
 */
function removeOldIndexedFiles($phash) {
	$where = 'phash=' . intval($phash);

		// Removing old registrations for tables.
	$fileTables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($fileTables as $fileTable) {
		if (!tx_indexedsearch_util::isTableUsed($fileTable)) {
			continue;
		}
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($fileTable, $where);
	}
}
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737 /********************************
1738 *
1739 * SQL Helper functions
1740 *
1741 *******************************/
1742
/**
 * Decides from mtime / tstamp whether the currently indexed page/file (identified by phash)
 * has to be (re-)indexed.
 *
 * Result codes (see also $this->reasons); negative means "do not index", positive means "index":
 *   -2  The minimum age was not exceeded, so indexing must not occur.
 *   -1  mtime matched the indexed record, nothing to do. As a side effect the tstamp of the
 *       record is refreshed, unless a max-age is configured.
 *    1  The configured max-age was exceeded.
 *    2  mtime was given and differs from the stored item_mtime.
 *    3  No mtime was given.
 *    4  No index_phash record was found (or the index_phash table is not in use).
 *
 * @param	integer		mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param	integer		"phash" used to select any already indexed page to see what its mtime is.
 * @return	integer		Result code, see list above.
 */
function checkMtimeTstamp($mtime, $phash) {
		// Without the table there cannot be an index record
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return 4;
	}

	$row = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . intval($phash));
	if (!$row) {
			// Never indexed before (not represented in index_phash)
		return 4;
	}

		// Max-age takes precedence over everything else
	if ($this->tstamp_maxAge && ($row['tstamp'] + $this->tstamp_maxAge) < $GLOBALS['EXEC_TIME']) {
		return 1;
	}

		// A configured min-age which has not passed yet blocks indexing
	if ($this->tstamp_minAge && !(($row['tstamp'] + $this->tstamp_minAge) < $GLOBALS['EXEC_TIME'])) {
		return -2;
	}

		// No mtime to compare with, so the page is indexed
	if (!$mtime) {
		return 3;
	}

		// mtime differs from what is stored in index_phash, time to re-index
	if ($row['item_mtime'] != $mtime) {
		return 2;
	}

		// mtime matched: no re-indexing. The tstamp is only refreshed when no max-age is in effect.
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
	}

	return -1;
}
1800
/**
 * Looks for an index_phash record with the current phash_grouping and content hash.
 *
 * The page only needs indexing if its content differs from what is already stored for
 * the same "phash_grouping". Values are taken from $this->hash['phash_grouping'] and
 * $this->content_md5h.
 *
 * @return	mixed		TRUE if the page needs to be indexed (no matching record, or index_phash not in use); otherwise the matching row (array with key "phash") to which the grlist record should be related.
 */
function checkContentHash() {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return TRUE;
	}

	$where = 'phash_grouping=' . intval($this->hash['phash_grouping']) . ' AND contentHash=' . intval($this->content_md5h);
	$match = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);

	return $match ? $match : TRUE;
}
1818
/**
 * Content hash check for external documents.
 *
 * Counts the index_phash records having the given phash_grouping and content hash.
 *
 * @param	integer		phash value to check (phash_grouping)
 * @param	integer		Content hash to check
 * @return	boolean		TRUE if the document needs to be indexed (no matching record counted, or index_phash not in use)
 */
function checkExternalDocContentHash($hashGr, $content_md5h) {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return TRUE;
	}

	$where = 'phash_grouping=' . intval($hashGr) . ' AND contentHash=' . intval($content_md5h);
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);

	return ($matches == 0);
}
1836
/**
 * Tells whether index_grlist holds at least one record for the given phash_x value
 * (the "real" phash of the current content, not the linked-to phash of the common search result page).
 *
 * @param	integer		Phash integer to test.
 * @return	boolean		TRUE if a record was found; FALSE if not, or if index_grlist is not in use.
 */
function is_grlist_set($phash_x) {
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return FALSE;
	}

	$found = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . intval($phash_x));

	return ($found > 0);
}
1851
/**
 * Writes a grlist entry for the phash and the current gr_list ($this->conf['gr_list'])
 * unless such an entry exists already. The insert is reported through log_setTSlogMessage().
 * Does nothing when the index_grlist table is not in use.
 *
 * @param	integer		phash of the search result that should be found
 * @param	integer		The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return	void
 * @see submit_grlist()
 */
function update_grlist($phash, $phash_x) {
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

	$where = 'phash=' . intval($phash) . ' AND hash_gr_list=' . tx_indexedsearch_util::md5inthash($this->conf['gr_list']);
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existing != 0) {
			// Entry is there already
		return;
	}

	$this->submit_grlist($phash, $phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '" . $this->conf['gr_list'] . "' for phash '" . $phash . "'", 1);
}
1869
/**
 * Sets the tstamp of an index_phash row to the current execution time ($GLOBALS['EXEC_TIME'])
 * and optionally stores a new item_mtime along with it.
 * Does nothing when the index_phash table is not in use.
 *
 * @param	integer		phash value
 * @param	integer		If set (non-zero), update the item_mtime field to this value.
 * @return	void
 */
function updateTstamp($phash, $mtime = 0) {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	$fields = array('tstamp' => $GLOBALS['EXEC_TIME']);
	if ($mtime) {
		$fields['item_mtime'] = intval($mtime);
	}

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), $fields);
}
1888
/**
 * Writes the freeIndexSetId from the current configuration ($this->conf['freeIndexSetId'])
 * into the index_phash record of the given phash.
 * Does nothing when the index_phash table is not in use.
 *
 * @param	integer		phash value
 * @return	void
 */
function updateSetId($phash) {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	$setId = intval($this->conf['freeIndexSetId']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), array('freeIndexSetId' => $setId));
}
1903
/**
 * Stores the parsetime in the index_phash record of the given phash.
 * Does nothing when the index_phash table is not in use.
 *
 * @param	integer		phash value.
 * @param	integer		Parsetime value to set.
 * @return	void
 */
function updateParsetime($phash, $parsetime) {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	$where = 'phash=' . intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array('parsetime' => intval($parsetime)));
}
1919
/**
 * Rewrites the rootline fields of all index_section records belonging to the current
 * page ($this->conf['id']). The field values are collected by getRootLineFields().
 * Does nothing when the index_section table is not in use.
 *
 * @return	void
 */
function updateRootline() {
	if (!tx_indexedsearch_util::isTableUsed('index_section')) {
		return;
	}

	$rootLineFields = array();
	$this->getRootLineFields($rootLineFields);

	$where = 'page_id=' . intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
}
1932
/**
 * Fills the rootline fields into the given field array.
 *
 * rl0, rl1 and rl2 are always set from the first three entries of $this->conf['rootline_uids'].
 * Additional fields can be registered as "field name => rootline level" pairs in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 * All values are cast to integer.
 *
 * @param	array		Field array, passed by reference
 * @return	void
 */
function getRootLineFields(array &$fieldArray) {
		// Standard levels
	foreach (array(0, 1, 2) as $level) {
		$fieldArray['rl' . $level] = intval($this->conf['rootline_uids'][$level]);
	}

		// Levels registered through configuration
	$additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($additionalFields)) {
		return;
	}
	foreach ($additionalFields as $fieldName => $rootLineLevel) {
		$fieldArray[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}
}
1951
/**
 * Removes any indexed pages with userlogins which have the same contentHash.
 * NOT USED anywhere inside this class!
 *
 * Selects the phash of every index_phash record that belongs to the current phash_grouping,
 * has the current content hash and is related to a grlist record whose hash_gr_list differs
 * from the hash of the default gr_list. Each of them is logged and handed over to
 * removeOldIndexedPages().
 * Does nothing unless both index_phash and index_grlist are in use.
 *
 * @return	void
 */
function removeLoginpagesWithContentHash() {
	if (!tx_indexedsearch_util::isTableUsed('index_phash') || !tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

	$where = 'A.phash=B.phash'
		. ' AND A.phash_grouping=' . intval($this->hash['phash_grouping'])
		. ' AND B.hash_gr_list<>' . tx_indexedsearch_util::md5inthash($this->defaultGrList)
		. ' AND A.contentHash=' . intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', $where);

		// A failed query yields no result to iterate over and nothing that could be freed
	if (!$res) {
		return;
	}

	while (FALSE !== ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		$this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
		$this->removeOldIndexedPages($row['phash']);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
}
1972
/**
 * Loads the library class file of the "crawler" extension (class.tx_crawler_lib.php).
 *
 * @return	void
 */
function includeCrawlerClass() {
	$crawlerLibFile = t3lib_extMgm::extPath('crawler') . 'class.tx_crawler_lib.php';
	t3lib_div::requireOnce($crawlerLibFile);
}
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991 /********************************
1992 *
1993 * SQL; Submitting words
1994 *
1995 *******************************/
1996
/**
 * Adds those words from the word list to index_words which are not stored there yet.
 *
 * First the number of index_words rows matching the word hashes is compared with the size of
 * the list. Only if they differ, the basewords already present are fetched and dropped from
 * the list; what remains is inserted (wid, baseword, metaphone).
 * Does nothing when index_words is not in use or the list is empty.
 *
 * @param	array		$wordListArray Word List array (where each word has information about position etc). The array keys are used as baseword.
 * @return	void
 */
function checkWordList($wordListArray) {
	if (!tx_indexedsearch_util::isTableUsed('index_words') || !count($wordListArray)) {
		return;
	}

		// WHERE clause covering the hashes of all words in the list
	$wordIds = array();
	foreach ($wordListArray as $wordInfo) {
		$wordIds[] = intval($wordInfo['hash']);
	}
	$where = 'wid IN (' . implode(',', $wordIds) . ')';

	$total = count($wordListArray);
	$known = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
	if ($known == $total) {
			// Every word is registered already
		return;
	}

	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
	$this->log_setTSlogMessage('Inserting words: ' . ($total - $known), 1);

		// Drop the words which exist in the table, leaving only the new ones
	while (FALSE != ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		unset($wordListArray[$row['baseword']]);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);

	foreach ($wordListArray as $baseword => $wordInfo) {
			// A duplicate-key error will occur here if a word was NOT removed by the unset() above. However as long as the words are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
			'wid' => $wordInfo['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordInfo['metaphone']
		));
	}
}
2034
/**
 * Submits RELATIONS between words and phash.
 *
 * All index_rel rows of the phash are deleted first; then one row is inserted per word with
 * its hash, count, first position, mapped frequency (count relative to $this->wordcount,
 * passed through freqMap()) and its "cmp" value masked by $this->flagBitMask.
 * Does nothing when the index_rel table is not in use.
 *
 * @param	array		Word list array
 * @param	integer		phash value
 * @return	void
 */
function submitWords($wordList, $phash) {
	if (!tx_indexedsearch_util::isTableUsed('index_rel')) {
		return;
	}

		// Start from scratch for this phash
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . intval($phash));

	foreach ($wordList as $wordInfo) {
		$relativeFrequency = $wordInfo['count'] / $this->wordcount;
		$relation = array(
			'phash' => $phash,
			'wid' => $wordInfo['hash'],
			'count' => $wordInfo['count'],
			'first' => $wordInfo['first'],
			'freq' => $this->freqMap($relativeFrequency),
			'flags' => ($wordInfo['cmp'] & $this->flagBitMask)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
2060
/**
 * Maps a frequency to the range used for storage, and back.
 *
 * The map factor is $this->freqMax * 100 * $this->freqRange.
 * - A frequency below 1 is multiplied with the map factor; the result is capped at $this->freqRange.
 * - Any other frequency is divided by the map factor (reverse direction).
 * The result is returned as computed, it is not cast to integer here.
 *
 * @param	double		Frequency
 * @return	mixed		Mapped frequency (number)
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax * 100 * $this->freqRange;

		// Reverse mapping
	if (!($freq < 1)) {
		return $freq / $mapFactor;
	}

		// Forward mapping with upper limit
	$scaled = $freq * $mapFactor;
	if ($scaled > $this->freqRange) {
		return $this->freqRange;
	}
	return $scaled;
}
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089 /********************************
2090 *
2091 * Hashing
2092 *
2093 *******************************/
2094
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * - "phash_grouping" identifies a "page" as the combination of id, type, language,
 *   mountpoint and cHash parameters.
 * - "phash" additionally takes the gr_list into account (subdivision for page compositions
 *   depending on login; such pages are expected to be similar regardless of the login).
 *
 * Both are built with md5inthash() from a serialized array, so the order of the array
 * entries is part of the hash and must be kept.
 *
 * @return	void
 */
function setT3Hashes() {
		// Identity of the page, regardless of user groups
	$pageIdentity = array(
		'id' => (int)$this->conf['id'],
		'type' => (int)$this->conf['type'],
		'sys_lang' => (int)$this->conf['sys_language_uid'],
		'MP' => (string)$this->conf['MP'],
		'cHash' => $this->cHashParams
	);
	$this->hash['phash_grouping'] = tx_indexedsearch_util::md5inthash(serialize($pageIdentity));

		// Same identity extended with the user groups
	$pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
	$this->hash['phash'] = tx_indexedsearch_util::md5inthash(serialize($pageIdentity));
}
2118
/**
 * Calculates the search hashes for an external file.
 *
 * "phash_grouping" is built from the file alone, "phash" from the file together with the
 * sub-information. Both are md5inthash() values of a serialized array.
 *
 * @param	string		File name / path which identifies it on the server
 * @param	array		Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return	array		Array with "phash_grouping" and "phash" inside.
 */
function setExtHashes($file, $subinfo = array()) {
		// Identity of the file as a whole
	$fileIdentity = array('file' => $file);
	$groupingHash = tx_indexedsearch_util::md5inthash(serialize($fileIdentity));

		// Identity of the specific part of the file
	$fileIdentity['subinfo'] = $subinfo;
	$partHash = tx_indexedsearch_util::md5inthash(serialize($fileIdentity));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $partHash
	);
}
2142
2143 /*********************************
2144 *
2145 * Internal logging functions
2146 *
2147 *********************************/
2148
/**
 * Wrapper for push() of the TT logging object; nothing happens if $GLOBALS['TT'] is not an object.
 *
 * @param	string		Title to set
 * @param	string		Key (?)
 * @return	void
 */
function log_push($msg, $key) {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->push($msg, $key);
}
2161
/**
 * Wrapper for pull() of the TT logging object; nothing happens if $GLOBALS['TT'] is not an object.
 *
 * @return	void
 */
function log_pull() {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->pull();
}
2170
/**
 * Wrapper for setTSlogMessage() of the TT logging object.
 * The message is forwarded to $GLOBALS['TT'] if that is an object, and in any case
 * appended to $this->internal_log.
 *
 * @param	string		Message to set
 * @param	integer		Error number
 * @return	void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->setTSlogMessage($msg, $errorNum);
	}

		// Kept independently of the TT object
	$this->internal_log[] = $msg;
}
2184
2185
2186
2187
2188
2189
2190
2191
2192 /**************************
2193 *
2194 * tslib_fe hooks:
2195 *
2196 **************************/
2197
/**
 * Makes sure that keywords are space-separated. This is important for their
 * proper display as a part of the fulltext index.
 *
 * The comma-separated list is split with t3lib_div::trimExplode(), re-joined with ", "
 * and wrapped in a leading and a trailing space.
 *
 * @param	string		$keywordList Comma-separated list of keywords
 * @return	string		Keyword list with a space after each comma, enclosed in spaces
 * @see http://bugs.typo3.org/view.php?id=1436
 */
protected function addSpacesToKeywordList($keywordList) {
	$spacedList = implode(', ', t3lib_div::trimExplode(',', $keywordList));
	return ' ' . $spacedList . ' ';
}
2210 }
2211 ?>