[TASK] Remove function index
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * Indexing class for TYPO3 frontend
35 *
36 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
37 * @package TYPO3
38 * @subpackage tx_indexedsearch
39 */
40 class tx_indexedsearch_indexer {
41
// Messages (keys are the status codes used to look up a log text for the mtime/timestamp check):
var $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => "The configured max-age was exceeded for the document and thus it's indexed.",
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

// HTML code blocks to exclude from indexing (comma list of tag names):
var $excludeSections = 'script,style';

// Supported Extensions for external files:
var $external_parsers = array();	// External parser objects, keys are file extension names. Values are objects with certain methods.

// Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
var $defaultGrList = '0,-1';

// Min/Max times:
var $tstamp_maxAge = 0;		// If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
var $tstamp_minAge = 0;		// If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
var $maxExternalFiles = 0;	// Max number of external files to index.

var $forceIndexing = FALSE;	// If TRUE, indexing is forced despite of hashes etc.
var $crawlerActive = FALSE;	// Set when crawler is detected (internal)

// INTERNALS:
var $defaultContentArray = array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => '',
);
var $wordcount = 0;
var $externalFileCounter = 0;

var $conf = array();			// Configuration set internally (see init functions for required keys and their meaning)
var $indexerConfig = array();	// Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
var $hash = array();			// Hash array, contains phash and phash_grouping
var $file_phash_arr = array();	// Hash array for files
var $contentParts = array();	// Content of TYPO3 page
var $content_md5h = '';
var $internal_log = array();	// Internal log
var $indexExternalUrl_content = '';

var $cHashParams = array();		// cHashparams array

var $freqRange = 32000;
var $freqMax = 0.1;

// Flag bit mask taken from the "flagBitMask" extension setting (clamped to 0-255) in init().
// Declared here so init() no longer creates it as a dynamic property; deliberately left
// without a default value so it stays NULL until init() has run.
var $flagBitMask;

// Objects:
/**
 * Charset class object
 *
 * @var t3lib_cs
 */
var $csObj;

/**
 * Metaphone object, if any
 *
 * @var user_DoubleMetaPhone
 */
var $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var tx_indexedsearch_lexer
 */
var $lexerObj;
114
115
116
117 /**
118 * Parent Object (TSFE) Initialization
119 *
120 * @param object Parent Object (frontend TSFE object), passed by reference
121 * @return void
122 */
function hook_indexContent(&$pObj) {

	// Extension Manager settings of indexed_search (stored as a serialized array):
	$extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// Forced re-indexing is switched on only when the crawler extension is loaded,
	// a crawler session is running and it carries the re-index processing instruction:
	$reindexRequestedByCrawler = t3lib_extMgm::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($reindexRequestedByCrawler) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

	// Nothing to do (and nothing to log) unless indexing is enabled for this page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

	// Each branch below is one reason not to index; the final else performs the indexing.
	if ($extensionSettings['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Collect the uid of every rootline entry, keeping the rootline keys:
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $levelKey => $pageRecord) {
			$rootlineUids[$levelKey] = $pageRecord['uid'];
		}

		// Internal configuration, copied from the frontend object:
		$this->conf = array(
			// Information about the page being indexed
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,

			// cHash string and the parameter array it belongs to
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,

			'crdate' => $pObj->page['crdate'],
			'page_cache_reg1' => $pObj->page_cache_reg1,

			'rootline_uids' => $rootlineUids,

			// Page content (HTML), alternative title, charset and last-changed time
			'content' => $pObj->content,
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			'metaCharset' => $pObj->metaCharset,
			'mtime' => $pObj->register['SYS_LASTCHANGED'],

			// Behaviour switches; index_metatags defaults to TRUE when not configured
			'index_externals' => $pObj->config['config']['index_externals'],
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,

			// Not used for frontend page indexing
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0,
		);

		// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
199
200
201
202
203
204
205
206
207 /****************************
208 *
209 * Backend API
210 *
211 ****************************/
212
213 /**
214 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
215 *
216 * @param integer The page uid, &id=
217 * @param integer The page type, &type=
218 * @param integer sys_language uid, typically &L=
219 * @param string The MP variable (Mount Points), &MP=
220 * @param array Rootline array of only UIDs.
221 * @param array Array of GET variables to register with this indexing
222 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
223 * @return void
224 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

	// A cHash is only calculated on request; otherwise it stays an empty string:
	$cHashValue = '';
	if ($createCHash) {
		$cHashValue = t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array));
	}

	// Replace the internal configuration completely:
	$this->conf = array(
		// Identification of the page being indexed
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		'gr_list' => '0,-1',	// Group list (hardcoded for now...)

		// cHash string and the parameters it was made from
		'cHash' => $cHashValue,
		'cHash_array' => $cHash_array,

		// Defaults
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',

		// Rootline, given as an array of uids only
		'rootline_uids' => $uidRL,

		// Behaviour: index external documents, description length 200, index meta tags
		'index_externals' => 1,
		'index_descrLgd' => 200,
		'index_metatags' => TRUE,
	);

	// Initialize the indexer based on the configuration above:
	$this->init();
}
257
258 /**
259 * Sets the free-index uid. Can be called right after backend_initIndexer()
260 *
261 * @param integer Free index UID
262 * @param integer Set id - an integer identifying the "set" of indexing operations.
263 * @return void
264 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	// Store both values in the internal configuration; the set id falls back to 0.
	$this->conf['freeIndexUid'] = $freeIndexUid;
	$this->conf['freeIndexSetId'] = $freeIndexSetId;
}
269
270 /**
271 * Indexing records as the content of a TYPO3 page.
272 *
273 * @param string Title equivalent
274 * @param string Keywords equivalent
275 * @param string Description equivalent
276 * @param string The main content to index
277 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
278 * @param integer Last modification time, in seconds
279 * @param integer The creation date of the content, in seconds
280 * @param integer The record UID that the content comes from (for registration with the indexed rows)
281 * @return void
282 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

	// All four text pieces are escaped, so they can never introduce markup of their own:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedContent = htmlspecialchars($content);

	// Timestamps and origin of the content:
	$this->conf['mtime'] = $mtime;
	$this->conf['crdate'] = $crdate;
	$this->conf['recordUid'] = $recordUid;

	// Fake HTML document wrapping the record, so the regular page indexing can parse it:
	$this->conf['content'] = '
	<html>
		<head>
			<title>'.$escapedTitle.'</title>
			<meta name="keywords" content="'.$escapedKeywords.'" />
			<meta name="description" content="'.$escapedDescription.'" />
		</head>
		<body>
			'.$escapedContent.'
		</body>
	</html>';

	// Charset of the content and no alternative title:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

	// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}
310
311
312
313
314
315
316
317
318
319
320
321
322
323 /********************************
324 *
325 * Initialization
326 *
327 *******************************/
328
329 /**
330 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
331 *
332 * @return void
333 */
function init() {

	// Prepare the cHash parameter array. When it holds parameters, the cHash itself is added
	// (so URLs come out right) and the encryptionKey is dropped, since it must never end up in links.
	$cHashParameters = $this->conf['cHash_array'];
	if (is_array($cHashParameters) && count($cHashParameters)) {
		if ($this->conf['cHash']) {
			$cHashParameters['cHash'] = $this->conf['cHash'];
		}
		unset($cHashParameters['encryptionKey']);
	}
	$this->cHashParams = $cHashParameters;

	// Set phash / phash_grouping identifying the indexed page:
	$this->setT3Hashes();

	// Indexer configuration from the Extension Manager; the age settings are given in hours:
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

	// External document parsers are only needed when external files are indexed:
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

	$hookConfiguration = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'];

	// Lexer (splits text into words); a configured lexer replaces the bundled one.
	// Example configuration (localconf.php): $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($hookConfiguration['lexer']) {
		$lexerReference = $hookConfiguration['lexer'];
	} else {
		$lexerReference = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	}
	$this->lexerObj = t3lib_div::getUserObj($lexerReference);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

	// Metaphone object, only when one is configured.
	// Example configuration (localconf.php): $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($hookConfiguration['metaphone']) {
		$this->metaphoneObj = t3lib_div::getUserObj($hookConfiguration['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}

	// Charset conversion object:
	$this->csObj = t3lib_div::makeInstance('t3lib_cs');
}
378
379 /**
380 * Initialize external parsers
381 *
382 * @return void
383 * @access private
384 * @see init()
385 */
function initializeExternalParsers() {
	$parserReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
	if (!is_array($parserReferences)) {
		return;
	}

	foreach ($parserReferences as $fileExtension => $objectReference) {
		// Register the parser under its file extension first, then hand it a back-reference:
		$this->external_parsers[$fileExtension] = t3lib_div::getUserObj($objectReference);
		$parser = $this->external_parsers[$fileExtension];
		$parser->pObj = $this;

		// A parser whose initParser() returns a false value is removed again:
		if (!$parser->initParser($fileExtension)) {
			unset($this->external_parsers[$fileExtension]);
		}
	}
}
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416 /********************************
417 *
418 * Indexing; TYPO3 pages (HTML content)
419 *
420 *******************************/
421
422 /**
423 * Start indexing of the TYPO3 page
424 *
425 * @return void
426 */
function indexTypo3PageContent() {

	// $check > 0 means the timestamp check asks for (re-)indexing; its value is a key of $this->reasons.
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	// Nothing to do when the timestamps are fine, the gr_list is registered and indexing is not forced:
	if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

	// Setting message:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

	// Divide into title, keywords, description and body; an alternative title overrides the parsed one:
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

	// Hash over all content parts (title, description and keywords included, so a changed title counts as a change).
	// The separator is passed first: the legacy "array first" argument order is rejected by PHP 8.
	$this->content_md5h = $this->md5inthash(implode('', $this->contentParts));

	// Look for an already indexed page with the very same content hash. If one exists, this page
	// needs no new index entry, unless the max-age limit ($check === 1) demands re-indexing anyway.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check !== 1) {
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

	$Pstart = t3lib_div::milliseconds();

	$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
	$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
	$this->log_pull();

	// Splitting words
	$this->log_push('Extract words from content','');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

	// Analyse the indexed words.
	$this->log_push('Analyse the extracted words','');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

	// Submitting page (phash) record
	$this->log_push('Submitting page','');
	$this->submitPage();
	$this->log_pull();

	// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words','');
	$this->checkWordList($indexArr);
	$this->submitWords($indexArr,$this->hash['phash']);
	$this->log_pull();

	// Set parsetime (external files below are not part of it)
	$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

	// Checking external files if configured for.
	$this->log_push('Checking external files','');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
506
507 /**
508 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
509 *
510 * @param string HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
511 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
512 * @see splitRegularContent()
513 */
function splitHTMLContent($content) {

	// Body is everything from the first "<body" onwards, head is whatever precedes it:
	$parts = $this->defaultContentArray;
	$parts['body'] = stristr($content,'<body');
	$head = substr($content,0,-strlen($parts['body']));

	// Title: when it contains a colon, only the part after the first colon is kept
	$this->embracingTags($head,'TITLE',$parts['title'],$ignoredRemainder,$ignoredAttributes);
	$titleSegments = explode(':',$parts['title'],2);
	$parts['title'] = trim(isset($titleSegments[1]) ? $titleSegments[1] : $titleSegments[0]);

	// Keywords and description meta tags
	if ($this->conf['index_metatags']) {
		// Collect the attribute string of every meta tag, consuming the head as we go.
		// The failing last call leaves an unset entry behind, which ends the loop below.
		$metaTags = array();
		$metaIndex = 0;
		while ($this->embracingTags($head,'meta',$ignoredContent,$head,$metaTags[$metaIndex])) {
			$metaIndex++;
		}

		foreach ($metaTags as $attributeString) {
			if (!isset($attributeString)) {
				break;
			}
			$attributes = t3lib_div::get_tag_attributes($attributeString);
			if (stristr($attributes['name'], 'keywords')) {
				$parts['keywords'] .= ',' . $this->addSpacesToKeywordList($attributes['content']);
			}
			if (stristr($attributes['name'], 'description')) {
				$parts['description'] .= ',' . $attributes['content'];
			}
		}
	}

	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($parts['body']);

	// Cut out every excluded section (script, style, ...) from the body, one occurrence per pass:
	foreach (explode(',',$this->excludeSections) as $excludedTag) {
		do {
			$sectionRemoved = $this->embracingTags($parts['body'],$excludedTag,$ignoredContent,$parts['body'],$ignoredAttributes);
		} while ($sectionRemoved);
	}

	// Put a space before every tag so that stripping the tags cannot glue words together:
	$parts['body'] = trim(strip_tags(str_replace('<',' <',$parts['body'])));

	$parts['keywords'] = trim($parts['keywords']);
	$parts['description'] = trim($parts['description']);

	return $parts;
}
559
560 /**
561 * Extract the charset value from HTML meta tag.
562 *
563 * @param string HTML content
564 * @return string The charset value if found.
565 */
function getHTMLcharset($content) {
	// Step 1: find the <meta http-equiv="content-type" ...> tag (case-insensitive)
	$contentTypeTagFound = preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$metaTagMatch);
	if (!$contentTypeTagFound) {
		return NULL;
	}

	// Step 2: pick the charset value out of that tag
	$charsetFound = preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$metaTagMatch[0],$charsetMatch);
	if (!$charsetFound) {
		return NULL;
	}

	return $charsetMatch[1];
}
573
574 /**
575 * Converts a HTML document to utf-8
576 *
577 * @param string HTML content, any charset
578 * @param string Optional charset (otherwise extracted from HTML)
579 * @return string Converted HTML
580 */
function convertHTMLToUtf8($content,$charset='') {

	// Without an explicit charset, fall back to the one declared in the HTML itself:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

	// Re-encode unless the charset is unknown or already utf-8:
	$needsConversion = $charset && $charset!=='utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

	// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
596
597 /**
598 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
599 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
600 * <title> of document or removing <script>-sections
601 *
602 * @param string String to search in
603 * @param string Tag name, eg. "script"
604 * @param string Passed by reference: Content inside found tag
605 * @param string Passed by reference: Content after found tag
606 * @param string Passed by reference: Attributes of the found tag.
607 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
608 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$openingTag = '<'.$tagName;
	$closingTag = '</'.$tagName.'>';

	// Case-insensitive search; yields the input from the opening tag onwards
	$fromOpeningTag = stristr($string,$openingTag);
	if (!$fromOpeningTag) {
		return FALSE;
	}

	// Split at the first ">": attribute string on the left, everything behind the opening tag on the right
	list($paramList,$behindOpeningTag) = explode('>',substr($fromOpeningTag,strlen($openingTag)),2);

	$fromClosingTag = stristr($behindOpeningTag,$closingTag);
	if (!$fromClosingTag) {
		// No closing tag: the content is blank and only the text behind the opening tag is handed back
		$tagContent = '';
		$stringAfter = $behindOpeningTag;
		return TRUE;
	}

	// Closing tag found: hand back the enclosed content, and the input with the whole element cut out
	$beforeOpeningTag = substr($string, 0, strlen($string) - strlen($fromOpeningTag));
	$tagContent = substr($behindOpeningTag,0,strlen($behindOpeningTag)-strlen($fromClosingTag));
	$stringAfter = $beforeOpeningTag.substr($fromClosingTag,strlen($closingTag));

	return TRUE;
}
629
630 /**
631 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
632 *
633 * @param string HTML Content, passed by reference
634 * @return boolean Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
635 */
function typoSearchTags(&$body) {
	// Split at every "<!--TYPO3SEARCH_" marker start (one optional whitespace after "<!--").
	// The first chunk is the text in front of the first marker; each later chunk begins with
	// the marker keyword, followed by "-->" and the text up to the next marker.
	$chunks = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

	if (count($chunks) <= 1) {
		// No marker at all: the body is left untouched
		return FALSE;
	}

	$body = '';
	foreach ($chunks as $chunk) {
		$keywordAndText = explode('-->',$chunk,2);
		switch (trim($keywordAndText[0])) {
			case 'begin':
				// Text following a begin marker is kept
				$body.= $keywordAndText[1];
				$prev = '';
			break;
			case 'end':
				// Text following an end marker is dropped; a remembered unmarked chunk is kept
				$body.= $prev;
			break;
			default:
				// Not a marker chunk: remember it in case an end marker follows
				$prev = $chunk;
			break;
		}
	}

	return TRUE;
}
658
659 /**
660 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
661 *
662 * @param string HTML content
663 * @return void
664 */
function extractLinks($content) {

	// Get links:
	$list = $this->extractHyperLinks($content);

	// The crawler is only used when configured and loaded. $crawler is initialized explicitly so
	// the is_object() checks below never read an undefined variable.
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

	// Traverse links:
	foreach ($list as $linkInfo) {

		// Decode entities. A localPath means the file is sent by a download script: the indexed URL
		// has to point to $linkInfo['href'], while localPath is the absolute path to the file.
		if ($linkInfo['localPath']) {
			$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath']);
		} else {
			$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
		}

		// Parse URL:
		$qParts = parse_url($linkSource);

		// Check for jumpurl (TYPO3 specific thing...): the real target is the jumpurl parameter
		if ($qParts['query'] && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);
			$linkSource = $getP['jumpurl'];
			$qParts = parse_url($linkSource);	// parse again due to new linkSource!
		}

		// A non-local link with a scheme is an external URL (http or otherwise):
		if (!$linkInfo['localPath'] && $qParts['scheme']) {
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

		// Links carrying a query string are not treated as files:
		if ($qParts['query']) {
			continue;
		}

		// Resolve the link to an absolute file path:
		$linkSource = urldecode($linkSource);
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		// Index local file:
		if ($linkInfo['localPath']) {

			// File behind a download script: index the file, but register it under the original href
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			if (is_object($crawler)) {
				$params = array(
					'document' => $linkSource,
					'alturl' => $linkInfo['href'],
					'conf' => $this->conf
				);
				// The page content is dropped from the parameters handed to the crawler queue
				unset($params['conf']['content']);

				$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
				$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
			} else {
				$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
			}
		} else {
			if (is_object($crawler)) {
				$params = array(
					'document' => $linkSource,
					'conf' => $this->conf
				);
				// The page content is dropped from the parameters handed to the crawler queue
				unset($params['conf']['content']);

				$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
				$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
			} else {
				$this->indexRegularDocument($linkSource);
			}
		}
	}
}
744
745 /**
746 * Extracts all links to external documents from the HTML content string
747 *
748 * @param string $html
749 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
750 * @see extractLinks()
751 */
function extractHyperLinks($html) {
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('a', $html);
	$hyperLinksData = array();

	foreach ($htmlParts as $index => $tagData) {
		// Only the odd-numbered parts are inspected as tags
		if (($index % 2) === 0) {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

		// Skip links without a href and links starting with "#".
		// substr() replaces the curly brace string offset, which is a parse error as of PHP 8.
		$href = $tagAttributes[0]['href'];
		if ($href && substr($href, 0, 1) !== '#') {
			$hyperLinksData[] = array(
				'tag' => $tagData,
				'href' => $href,
				'localPath' => $this->createLocalPath($href)
			);
		}
	}

	return $hyperLinksData;
}
775
776 /**
777 * Extracts the "base href" from content string.
778 *
779 * @param string Content to analyze
780 * @return string The base href or an empty string if not found
781 */
public function extractBaseHref($html) {
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$baseHref = '';

	foreach ($htmlParser->splitTags('base', $html) as $partIndex => $part) {
		// Only the odd-numbered parts are inspected as tags
		if (($partIndex % 2) === 0) {
			continue;
		}

		$attributes = $htmlParser->get_tag_attributes($part, TRUE);
		$tagName = $htmlParser->getFirstTagName($part);
		if (strtolower($tagName) != 'base') {
			continue;
		}

		// Stop at the first base tag that carries a non-empty href
		$baseHref = $attributes[0]['href'];
		if ($baseHref) {
			break;
		}
	}

	return $baseHref;
}
801
802 /******************************************
803 *
804 * Indexing; external URL
805 *
806 ******************************************/
807
808 /**
809 * Index External URLs HTML content
810 *
811 * @param string URL, eg. "http://typo3.org/"
812 * @return void
813 * @see indexRegularDocument()
814 */
function indexExternalUrl($externalUrl) {

	// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);

	// Find the content type. The exact spelling "Content-Type" is tried first (previous behaviour);
	// since HTTP header names are case-insensitive, any other spelling is accepted as well.
	$contentType = '';
	if (is_array($urlHeaders)) {
		if (isset($urlHeaders['Content-Type'])) {
			$contentType = $urlHeaders['Content-Type'];
		} else {
			foreach ($urlHeaders as $headerName => $headerValue) {
				if (strtolower(trim($headerName)) === 'content-type') {
					$contentType = $headerValue;
					break;
				}
			}
		}
	}

	// Only HTML documents are indexed:
	if (!stristr((string)$contentType, 'text/html')) {
		return;
	}

	// Fetch the document; the content is also kept in $this->indexExternalUrl_content
	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

	// Create temporary file:
	$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
	if ($tmpFile) {
		t3lib_div::writeFile($tmpFile, $content);

		// Index that file:
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');	// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
		unlink($tmpFile);
	}
}
840
/**
 * Getting HTTP request headers of URL
 *
 * Each non-empty line of the response is split at the first colon; the part
 * before it becomes the array key, the rest (including any leading space)
 * the value. Lines without a colon (such as the HTTP status line) are kept
 * as keys with a NULL value.
 *
 * @param string The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
function getUrlHeaders($url) {
	$content = t3lib_div::getUrl($url,2);	// Try to get the headers only

	if (!strlen($content)) {
		return FALSE;
	}

		// Compile headers:
	$retVal = array();
	foreach (t3lib_div::trimExplode(LF,$content,1) as $line) {
		if (!strlen(trim($line))) {
			break;	// Stop at the first empty line (= end of header)
		}

			// Not every line contains a colon, so the value part must not be taken for granted
		$lineParts = explode(':', $line, 2);
		$retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
	}

	return $retVal;
}
866
867
868
/**
 * Tries to map a link target to a file on the local file system.
 *
 * A fixed list of resolver methods is tried in order; the first one that
 * yields a non-empty path wins.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
	static $resolvers = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);

	$resolvedPath = '';
	foreach ($resolvers as $resolver) {
		$resolvedPath = $this->$resolver($sourcePath);
		if ($resolvedPath != '') {
			return $resolvedPath;
		}
	}

	return $resolvedPath;
}
892
/**
 * Looks the path up in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'],
 * keyed by the short MD5 of the source path. This is useful for various
 * download extensions that hide the actual file name but still want the
 * file to be indexed.
 *
 * @param string $sourcePath
 * @return string Registered local file path or an empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}

	$key = t3lib_div::shortMD5($sourcePath);

		// Note: not using self::isAllowedLocalFile here because this method
		// is allowed to index files outside of the web site (for example,
		// protected downloads)
	if (!isset($registeredFiles[$key]) || !is_file($registeredFiles[$key])) {
		return '';
	}

	return $registeredFiles[$key];
}
915
/**
 * Resolves URLs that start with the site URL of the current request
 * (TYPO3_SITE_URL) to a file below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local file path or an empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	$siteUrl = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);

	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
935
/**
 * Resolves URLs that start with the configured config.absRefPrefix to a
 * file below PATH_site. This requires TSFE; without a tslib_fe instance in
 * $GLOBALS['TSFE'] nothing is resolved.
 *
 * @param string $sourcePath
 * @return string Local file path or an empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
		return '';
	}

	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);

	if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $prefixLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
958
/**
 * Attempts to create a local file path from the absolute URL without
 * schema (a path starting with "/"). The leading slash is stripped and the
 * remainder is looked up below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local file path or an empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	$localPath = '';

		// substr() replaces the "{0}" string offset syntax, which was removed in PHP 8
		// (and raised a notice for an empty string)
	if (substr($sourcePath, 0, 1) == '/') {
		$sourcePath = substr($sourcePath, 1);
		$localPath = PATH_site . $sourcePath;
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}

	return $localPath;
}
977
/**
 * Resolves a relative URL to a file below PATH_site.
 *
 * @param string $sourcePath
 * @return string Local file path or an empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}

	$candidate = PATH_site . $sourcePath;

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
994
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not
 * start with a slash.
 *
 * @param string $url
 * @return boolean
 */
static protected function isRelativeURL($url) {
		// parse_url() may return FALSE for seriously malformed URLs and omits
		// components that are not present, so every part is read defensively
	$urlParts = @parse_url($url);
	$scheme = (is_array($urlParts) && isset($urlParts['scheme'])) ? $urlParts['scheme'] : '';
	$path = (is_array($urlParts) && isset($urlParts['path'])) ? $urlParts['path'] : '';

		// substr() replaces the "{0}" string offset syntax, which was removed in PHP 8
	return ($scheme == '' && substr($path, 0, 1) != '/');
}
1005
/**
 * Tells whether the path, after resolving back paths ("../"), starts with
 * PATH_site and points to an existing file.
 *
 * @param string $filePath
 * @return boolean
 */
static protected function isAllowedLocalFile($filePath) {
	$resolvedPath = t3lib_div::resolveBackPath($filePath);

	if (substr($resolvedPath, 0, strlen(PATH_site)) != PATH_site) {
		return FALSE;
	}

	return is_file($resolvedPath);
}
1018
1019 /******************************************
1020 *
1021 * Indexing; external files (PDF, DOC, etc)
1022 *
1023 ******************************************/
1024
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * For every content part reported by fileContentParts() the method decides
 * (via mtime/timestamp check, external file counter and content hash)
 * whether the part has to be (re-)indexed, and finally makes sure a section
 * record exists for it.
 *
 * @param string Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param boolean If set, indexing is forced (despite content hashes, mtime etc).
 * @param string Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string File extension for temporary file.
 * @return void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

		// Init; a file name without extension yields an empty extension
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

		// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}

	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

		// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

					// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
						// Note: implode() must get the glue first; the legacy (array, glue) order is rejected by PHP 8.
					$content_md5h = $this->md5inthash(implode('', $contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// Checking and setting sections:
			// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
1130
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string File extension, eg. "pdf", "doc" etc.
 * @param string Absolute filename of file (must exist and be validated OK before calling function)
 * @param string Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return mixed Whatever the external parser returns (expected: standard content array with title, description, keywords, body keys); NULL if no parser object is registered for the extension
 */
function readFileContent($ext,$absFile,$cPKey) {
		// Explicit default, so that a missing parser does not end in reading an undefined variable
	$contentArr = NULL;

		// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
1149
/**
 * Asks the external parser of the extension into which sections the
 * document should be divided. Without a parser object the document is
 * treated as a single part with the pointer 0.
 *
 * @param string File extension
 * @param string Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
function fileContentParts($ext,$absFile) {
	if (!is_object($this->external_parsers[$ext])) {
		return array(0);
	}

		// Consult relevant external document parser:
	return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
}
1167
/**
 * Wraps non-HTML content (from external files for instance) into a content
 * array: a copy of $this->defaultContentArray with "body" set to the input.
 *
 * @param string Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (the remaining keys come from the default content array)
 * @see splitHTMLContent()
 */
function splitRegularContent($content) {
	$result = $this->defaultContentArray;
	$result['body'] = $content;

	return $result;
}
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195 /**********************************
1196 *
1197 * Analysing content, Extracting words
1198 *
1199 **********************************/
1200
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * charset is given as "utf-8" already) and decodes numeric / HTML entities
 * into real characters. The array is modified in place.
 *
 * @param array Standard content array (passed by reference)
 * @param string Charset of the input content (converted to utf-8)
 * @return void
 */
function charsetEntity2utf8(&$contentArr, $charset) {
	$needsConversion = ($charset!=='utf-8');

	foreach (array_keys($contentArr) as $partKey) {
		if (!strlen($contentArr[$partKey])) {
			continue;
		}

			// Convert charset if necessary
		if ($needsConversion) {
			$contentArr[$partKey] = $this->csObj->utf8_encode($contentArr[$partKey], $charset);
		}

			// decode all numeric / html-entities in the string to real characters:
		$contentArr[$partKey] = $this->csObj->entities_to_utf8($contentArr[$partKey],TRUE);
	}
}
1223
/**
 * Splits every part of the content array (from the split*Content functions)
 * into words using the lexer object.
 *
 * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words (unique words for title, keywords and description)
 */
function processWordsInArrays($contentArr) {

		// split all parts to words
	foreach (array_keys($contentArr) as $partKey) {
		$contentArr[$partKey] = $this->lexerObj->split2Words($contentArr[$partKey]);
	}

		// For title, keywords, and description we don't want duplicates:
	foreach (array('title', 'keywords', 'description') as $headerKey) {
		$contentArr[$headerKey] = array_unique($contentArr[$headerKey]);
	}

	return $contentArr;
}
1245
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length is taken from $this->conf['index_descrLgd'] (limited to
 * 0-255, default 200). Tabs and line breaks in the body are turned into
 * spaces before the text is truncated.
 *
 * @param array Content array
 * @return string Description string; empty if the configured length is 0
 */
function bodyDescription($contentArr) {
		// Explicit default: with a length of 0 the variable would otherwise be undefined on return
	$bodyDescription = '';

		// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
		$bodyDescription = str_replace(array(' ',TAB,CR,LF),' ',$contentArr['body']);

			// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1267
/**
 * Analyzes content to use for indexing.
 *
 * Words from title, keywords and description are registered with the
 * bit offsets 7, 6 and 5 respectively, then the body words are added.
 *
 * @param array Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array: word => information collected by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content) {
	$indexArr = array();

	$this->analyzeHeaderinfo($indexArr,$content,'title',7);
	$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
	$this->analyzeHeaderinfo($indexArr,$content,'description',5);
	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}
1285
/**
 * Calculates relevant information for headercontent
 *
 * For every word the bit 2^$offset is set in "cmp", "count" is increased,
 * and "hash" and "metaphone" are (re-)calculated. $this->wordcount is
 * increased per word.
 *
 * @param array Index array, passed by reference
 * @param array Standard content array
 * @param string Key from standard content array
 * @param integer Bit-wise priority to type
 * @return void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
	foreach ($content[$key] as $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

			// Initialize the accumulating fields on first sight of a word instead of reading undefined indexes
		if (!isset($retArr[$val]['cmp'])) {
			$retArr[$val]['cmp'] = 0;
		}
		if (!isset($retArr[$val]['count'])) {
			$retArr[$val]['count'] = 0;
		}

		$retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset);
		$retArr[$val]['count'] = $retArr[$val]['count']+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
1305
/**
 * Calculates relevant information for bodycontent
 *
 * Words not yet present in the index array get "first" (the key of the
 * word in the body array), "hash" and "metaphone"; "count" is increased for
 * every occurrence. $this->wordcount is increased per word.
 *
 * @param array Index array, passed by reference
 * @param array Standard content array
 * @return void
 */
function analyzeBody(&$retArr,$content) {
	foreach ($content['body'] as $key => $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		if (!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

			// A word seen for the first time has no counter yet; start from zero instead of reading an undefined index
		$previousCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $previousCount+1;
		$this->wordcount++;
	}
}
1325
/**
 * Creating metaphone based hash from input word
 *
 * If a metaphone object is available it is asked (together with the
 * configured sys_language_uid); otherwise PHP's native metaphone() is used.
 *
 * @param string Word to convert
 * @param boolean If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (or raw value, string)
 */
function metaphone($word,$retRaw=FALSE) {
	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

		// Return raw value?
	if ($retRaw) {
		return $rawValue;
	}

		// Otherwise create hash and return integer (0 for an empty metaphone value)
	return ($rawValue == '') ? 0 : hexdec(substr(md5($rawValue), 0, 7));
}
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368 /********************************
1369 *
1370 * SQL; TYPO3 Pages
1371 *
1372 *******************************/
1373
/**
 * Writes the index records of the current TYPO3 page (not external media)
 * to the database: index_phash, index_section, index_grlist, index_fulltext
 * and - in debug mode - index_debug. Existing records for the phash are
 * removed first.
 *
 * @return void
 */
function submitPage() {
	$phash = $this->hash['phash'];
	$now = $GLOBALS['EXEC_TIME'];

		// Remove any current data for this phash:
	$this->removeOldIndexedPages($phash);

		// PROCESSING index_phash
	$phashRecord = array(
		'phash' => $phash,
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $now,
		'crdate' => $now,
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRecord);

		// PROCESSING index_section
	$this->submit_section($phash,$phash);

		// PROCESSING index_grlist
	$this->submit_grlist($phash,$phash);

		// PROCESSING index_fulltext (optionally cut to the configured length)
	$fullText = implode(' ', $this->contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', array(
		'phash' => $phash,
		'fulltextdata' => $fullText
	));

		// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
		'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', array(
		'phash' => $phash,
		'debuginfo' => serialize($debugInfo)
	));
}
1444
/**
 * Inserts an index_grlist record for the configured gr_list.
 *
 * @param integer Search result record phash
 * @param integer Actual phash of current content
 * @return void
 * @see update_grlist()
 */
function submit_grlist($hash,$phash_x) {
	$grList = $this->conf['gr_list'];

		// Setting the gr_list record
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($grList),
		'gr_list' => $grList
	));
}
1464
/**
 * Inserts an index_section record for the current page id, completed by
 * getRootLineFields().
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param integer phash of TYPO3 parent search result record
 * @param integer phash of the file indexation search record
 * @return void
 */
function submit_section($hash,$hash_t3) {
	$sectionRecord = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

		// Let getRootLineFields() work on the record before it is stored
	$this->getRootLineFields($sectionRecord);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRecord);
}
1484
/**
 * Deletes all index records of a TYPO3 page, identified by $phash.
 *
 * @param integer phash value to flush
 * @return void
 */
function removeOldIndexedPages($phash) {
	$phashValue = intval($phash);

		// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.$phashValue);
	}

		// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phashValue);
}
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513 /********************************
1514 *
1515 * SQL; External media
1516 *
1517 *******************************/
1518
1519
/**
 * Writes the index records of an external file to the database:
 * index_phash, index_fulltext and - in debug mode - index_debug. Existing
 * records for the phash are removed first.
 *
 * @param array Array with phash and phash_grouping keys for file
 * @param string File name
 * @param array Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string File extension determining the type of media.
 * @param integer Modification time of file.
 * @param integer Creation time of file.
 * @param integer Size of file in bytes
 * @param integer Content HASH value.
 * @param array Standard content array (using only title and body for a file)
 * @return void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

		// Find item Type; the extension itself is used if the parser does not map it:
	$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	if (!$storeItemType) {
		$storeItemType = $ext;
	}

		// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

		// Split filename; a scheme marks the file as external URL:
	$fileParts = parse_url($file);
	$now = $GLOBALS['EXEC_TIME'];

		// PROCESSING index_phash
	$phashRecord = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $now,
		'crdate' => $now,
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $fileParts['scheme'] ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRecord);

		// PROCESSING index_fulltext (optionally cut to the configured length)
	$fullText = implode(' ', $contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', array(
		'phash' => $hash['phash'],
		'fulltextdata' => $fullText
	));

		// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', array(
		'phash' => $hash['phash'],
		'debuginfo' => serialize($debugInfo)
	));
}
1593
/**
 * Stores a gr_list record for a file unless one exists already for either
 * the default gr_list or the currently configured gr_list.
 *
 * @param integer phash value of file
 * @return void
 */
function submitFile_grlist($hash) {
		// Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
	$where = 'phash=' . intval($hash) .
		' AND (hash_gr_list=' . $this->md5inthash($this->defaultGrList) .
		' OR hash_gr_list=' . $this->md5inthash($this->conf['gr_list']) . ')';

	$existingRecords = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existingRecords) {
		return;
	}

	$this->submit_grlist($hash,$hash);
}
1613
/**
 * Stores file section for a file IF it does not exist for the current page
 *
 * The existence check uses exec_SELECTcountRows() (like submitFile_grlist()
 * does), so no result set is left behind unfreed.
 *
 * @param integer phash value of file
 * @return void
 */
function submitFile_section($hash) {
		// Testing if there is a section
	$count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
		'phash',
		'index_section',
		'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id'])
	);
	if (!$count) {
		$this->submit_section($hash,$this->hash['phash']);
	}
}
1627
/**
 * Deletes the index records of an external file, identified by $phash,
 * from index_phash, index_grlist, index_fulltext and index_debug.
 *
 * @param integer phash value to flush
 * @return void
 */
function removeOldIndexedFiles($phash) {
	$where = 'phash='.intval($phash);

		// Removing old registrations for tables.
	$tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
	}
}
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656 /********************************
1657 *
1658 * SQL Helper functions
1659 *
1660 *******************************/
1661
/**
 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
 * Return positive integer if the page needs to be indexed
 *
 * Side effect: if the mtime matched and no max age is configured, the
 * timestamp of the indexed record is updated.
 *
 * @param integer mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param integer "phash" used to select any already indexed page to see what its mtime is.
 * @return integer Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) mtime matched so no need to reindex page. 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index.
 */
function checkMtimeTstamp($mtime,$phash) {

		// Select indexed page; the result set is released right after the single row was read:
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
	if ($res) {
		$GLOBALS['TYPO3_DB']->sql_free_result($res);
	}

		// Page has never been indexed (is not represented in the index_phash table).
	if (!$row) {
		return 4;
	}

		// The configured max-age was exceeded for the document and thus it's indexed.
	if ($this->tstamp_maxAge && ($row['tstamp'] + $this->tstamp_maxAge) < $GLOBALS['EXEC_TIME']) {
		return 1;
	}

		// The minimum age was not exceeded
	if ($this->tstamp_minAge && !(($row['tstamp'] + $this->tstamp_minAge) < $GLOBALS['EXEC_TIME'])) {
		return -2;
	}

		// The minimum age was exceeded (or is not set), but mtime was not set, so the page must be indexed.
	if (!$mtime) {
		return 3;
	}

		// The mtime is different from the index_phash mtime, it's about time to re-index.
	if ($row['item_mtime'] != $mtime) {
		return 2;
	}

		// mtime matched the document, so no changes detected and no content updated
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);	// Update the timestamp
		$this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
	}

	return -1;
}
1700
/**
 * Check content hash in phash table
 *
 * Looks for an index_phash record with the same phash_grouping and the same
 * content hash as the current page.
 *
 * @return mixed Returns 1 if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
 */
function checkContentHash() {
		// With this query the page will only be indexed if its content is different from the same "phash_grouping" -page.
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

		// Only the first row is of interest, so the result set can be released right away
	if ($res) {
		$GLOBALS['TYPO3_DB']->sql_free_result($res);
	}

	return $row ? $row : 1;
}
1714
/**
 * Check content hash for external documents
 * Looks for an index_phash row with the given grouping hash and content hash.
 *
 * @param	integer		phash value to check (phash_grouping)
 * @param	integer		Content hash to check
 * @return	integer		1 if the document needs to be indexed (that is, there was no result), otherwise 0
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
	$where = 'A.phash_grouping=' . intval($hashGr) . ' AND A.contentHash=' . intval($content_md5h);
	$result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);

		// Only the existence of a row matters, its content is not used.
	return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result) ? 0 : 1;
}
1730
/**
 * Checks if a grlist record has been set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
 *
 * @param	integer		Phash integer to test.
 * @return	mixed		The value returned by exec_SELECTcountRows() for index_grlist rows with this phash_x (presumably the row count - confirm the error return against the DB API)
 */
function is_grlist_set($phash_x) {
	$where = 'phash_x=' . intval($phash_x);
	$matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', $where);

	return $matches;
}
1744
/**
 * Check if a grlist entry for this phash and the current gr_list exists and if not so, write one.
 *
 * @param	integer		phash of the search result that should be found
 * @param	integer		The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return	void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x) {
	$where = 'phash=' . intval($phash) . ' AND hash_gr_list=' . $this->md5inthash($this->conf['gr_list']);
	$result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

		// An entry is already there, nothing to do.
	if ($GLOBALS['TYPO3_DB']->sql_num_rows($result)) {
		return;
	}

	$this->submit_grlist($phash, $phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '" . $this->conf['gr_list'] . "' for phash '" . $phash . "'", 1);
}
1760
/**
 * Update tstamp for a phash row.
 * The tstamp field is set to $GLOBALS['EXEC_TIME'].
 *
 * @param	integer		phash value
 * @param	integer		If set, update the mtime field (item_mtime) to this value.
 * @return	void
 */
function updateTstamp($phash,$mtime=0) {
	$fields = array();
	$fields['tstamp'] = $GLOBALS['EXEC_TIME'];

		// item_mtime is only touched when a non-empty mtime was passed.
	if ($mtime) {
		$fields['item_mtime'] = intval($mtime);
	}

	$where = 'phash=' . intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
1776
/**
 * Update SetID of the index_phash record.
 * Writes the configured "freeIndexSetId" (cast to integer) to the row.
 *
 * @param	integer		phash value
 * @return	void
 */
function updateSetId($phash) {
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_phash',
		'phash=' . intval($phash),
		array('freeIndexSetId' => intval($this->conf['freeIndexSetId']))
	);
}
1790
/**
 * Update parsetime for phash row.
 *
 * @param	integer		phash value.
 * @param	integer		Parsetime value to set (cast to integer before it is written).
 * @return	void
 */
function updateParsetime($phash,$parsetime) {
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_phash',
		'phash=' . intval($phash),
		array('parsetime' => intval($parsetime))
	);
}
1805
/**
 * Update section rootline for the page
 * The rootline fields from getRootLineFields() are written to the index_section rows of the configured page id.
 *
 * @return	void
 */
function updateRootline() {
		// Filled by reference
	$rootlineFields = array();
	$this->getRootLineFields($rootlineFields);

	$where = 'page_id=' . intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootlineFields);
}
1818
/**
 * Adding values for root-line fields.
 * rl0, rl1 and rl2 are standard. More fields can be configured in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as "field name => rootline level" pairs.
 *
 * @param	array		Field array, passed by reference
 * @return	void
 */
function getRootLineFields(&$fieldArr) {
		// Standard fields: the page uids of the first three rootline levels
	foreach (array('rl0' => 0, 'rl1' => 1, 'rl2' => 2) as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}

		// Optional additional fields
	$additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($additionalFields)) {
		return;
	}
	foreach ($additionalFields as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}
}
1838
/**
 * Removes any indexed pages with userlogins which has the same contentHash
 * Selects the phash rows of the current "phash_grouping" and content hash whose gr_list hash differs from the hash of the default gr_list, and removes each of them.
 * NOT USED anywhere inside this class!
 *
 * @return	void
 */
function removeLoginpagesWithContentHash() {
		// The line breaks inside the literal are part of the WHERE clause string.
	$where = '
A.phash=B.phash
AND A.phash_grouping=' . intval($this->hash['phash_grouping']) . '
AND B.hash_gr_list!=' . $this->md5inthash($this->defaultGrList) . '
AND A.contentHash=' . intval($this->content_md5h);

	$result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);
	while ($duplicate = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result)) {
		$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='" . $duplicate['phash'] . "' are now removed.", 1);
		$this->removeOldIndexedPages($duplicate['phash']);
	}
}
1856
/**
 * Includes the crawler class
 * Loads "class.tx_crawler_lib.php" from the path that t3lib_extMgm::extPath() reports for the "crawler" extension.
 *
 * @return	void
 */
function includeCrawlerClass() {
		// NOTE(review): the variable is not referenced in this method itself. A file included from inside a function runs in that function's variable scope, so this presumably makes $TYPO3_CONF_VARS visible to the included file - confirm before removing.
	global $TYPO3_CONF_VARS;

		// require_once: the file is loaded only once even if this method is called repeatedly.
	require_once(t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php');
}
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877 /********************************
1878 *
1879 * SQL; Submitting words
1880 *
1881 *******************************/
1882
/**
 * Adds new words to db
 * Words of the list which are not found in index_words (looked up by their hash, "wid") are inserted there.
 *
 * @param	array		Word List array (where each word has information about position etc). The keys are the base words, each value holds at least "hash" and "metaphone".
 * @return	void
 */
function checkWordList($wl) {
		// Collect the hashes of all words
	$wordHashes = array();
	foreach ($wl as $wordData) {
		$wordHashes[] = $wordData['hash'];
	}
	if (!count($wordHashes)) {
		return;
	}

	$db = $GLOBALS['TYPO3_DB'];
	$result = $db->exec_SELECTquery('baseword', 'index_words', 'wid IN (' . implode(',', $wordHashes) . ')');

		// If every word was found there is nothing to insert
	$knownCount = $db->sql_num_rows($result);
	$totalCount = count($wl);
	if ($knownCount == $totalCount) {
		return;
	}
	$this->log_setTSlogMessage('Inserting words: ' . ($totalCount - $knownCount), 1);

		// Drop the words which are in the table already
	while ($knownWord = $db->sql_fetch_assoc($result)) {
		unset($wl[$knownWord['baseword']]);
	}

	foreach ($wl as $baseword => $wordData) {
		$newWord = array(
			'wid' => $wordData['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordData['metaphone']
		);
			// A duplicate-key error will occur here if a word was NOT removed by the unset() above. However as long as the words in $wl are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
		$db->exec_INSERTquery('index_words', $newWord);
	}
}
1916
/**
 * Submits RELATIONS between words and phash
 * All existing index_rel rows of the phash are deleted first, then one row is inserted per word.
 *
 * @param	array		Word list array. Each entry is read for "hash", "count", "first" and "cmp".
 * @param	integer		phash value
 * @return	void
 */
function submitWords($wl,$phash) {
	$db = $GLOBALS['TYPO3_DB'];
	$db->exec_DELETEquery('index_rel', 'phash=' . intval($phash));

	foreach ($wl as $wordData) {
			// Share of this word in the total word count, mapped by freqMap()
		$relativeFrequency = $wordData['count'] / $this->wordcount;

		$relation = array(
			'phash' => $phash,
			'wid' => $wordData['hash'],
			'count' => $wordData['count'],
			'first' => $wordData['first'],
			'freq' => $this->freqMap($relativeFrequency),
			'flags' => ($wordData['cmp'] & $this->flagBitMask)
		);
		$db->exec_INSERTquery('index_rel', $relation);
	}
}
1940
/**
 * Maps a frequency to the stored range and back.
 * The map factor is $this->freqMax * 100 * $this->freqRange.
 * A value below 1 is multiplied by the map factor and capped at $this->freqRange.
 * Any other value is divided by the map factor (the way back).
 * The result is not rounded here.
 *
 * @param	double		Frequency
 * @return	mixed		Mapped frequency
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax * 100 * $this->freqRange;

	if ($freq < 1) {
		$scaled = $freq * $mapFactor;
			// Cap at the upper end of the range
		return ($scaled > $this->freqRange) ? $this->freqRange : $scaled;
	}

	return $freq / $mapFactor;
}
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970 /********************************
1971 *
1972 * Hashing
1973 *
1974 *******************************/
1975
/**
 * Get search hash, T3 pages
 * Sets $this->hash['phash_grouping'] and $this->hash['phash'] from the current configuration.
 *
 * @return	void
 */
function setT3Hashes() {

		// Values which identify a "page": id, type, language, mountpoint and cHash parameters
	$pageIdentity = array(
		'id' => intval($this->conf['id']),
		'type' => intval($this->conf['type']),
		'sys_lang' => intval($this->conf['sys_language_uid']),
		'MP' => strval($this->conf['MP']),
		'cHash' => $this->cHashParams
	);

		// Grouping hash
	$groupingSource = serialize($pageIdentity);
	$this->hash['phash_grouping'] = $this->md5inthash($groupingSource);

		// Plain phash: the gr_list is added, so the special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.
	$pageIdentity['gr_list'] = strval($this->conf['gr_list']);
	$phashSource = serialize($pageIdentity);
	$this->hash['phash'] = $this->md5inthash($phashSource);
}
1999
/**
 * Get search hash, external files
 * The grouping hash is made from the file only, the phash from the file plus the sub information.
 *
 * @param	string		File name / path which identifies it on the server
 * @param	array		Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return	array		Array with "phash_grouping" and "phash" inside.
 */
function setExtHashes($file,$subinfo=array()) {
	$identity = array(
		'file' => $file,
	);
	$groupingHash = $this->md5inthash(serialize($identity));

		// Add subinfo for the plain phash
	$identity['subinfo'] = $subinfo;
	$documentHash = $this->md5inthash(serialize($identity));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $documentHash
	);
}
2023
/**
 * md5 integer hash
 * Only the first 7 hex digits of the md5 hash are used. Using 7 instead of 8 makes the integers lower than 32 bit (28 bit), so they do not interfere with UNSIGNED integers or PHP-versions which have varying output from the hexdec function.
 *
 * @param	string		String to hash
 * @return	integer		Integer interpretation of the md5 hash of input string.
 */
function md5inthash($str) {
	$digest = md5($str);
	$leadingDigits = substr($digest, 0, 7);

	return hexdec($leadingDigits);
}
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045 /*********************************
2046 *
2047 * Internal logging functions
2048 *
2049 *********************************/
2050
/**
 * Push function wrapper for TT logging
 * Does nothing if $GLOBALS['TT'] is not an object.
 *
 * @param	string		Title to set
 * @param	string		Key (?)
 * @return	void
 */
function log_push($msg,$key) {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->push($msg, $key);
}
2061
/**
 * Pull function wrapper for TT logging
 * Does nothing if $GLOBALS['TT'] is not an object.
 *
 * @return	void
 */
function log_pull() {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->pull();
}
2070
/**
 * Set log message function wrapper for TT logging
 * The message is passed on to $GLOBALS['TT'] if that is an object and is in any case added to $this->internal_log.
 *
 * @param	string		Message to set
 * @param	integer		Error number
 * @return	void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
	if (is_object($GLOBALS['TT'])) {
		$GLOBALS['TT']->setTSlogMessage($msg, $errorNum);
	}

		// Kept regardless of whether TT is available
	$this->internal_log[] = $msg;
}
2082
2083
2084
2085
2086
2087
2088
2089
2090 /**************************
2091 *
2092 * tslib_fe hooks:
2093 *
2094 **************************/
2095
/**
 * Makes sure that keywords are space-separated. This is important for their
 * proper displaying as a part of fulltext index.
 * The list is split at commas with t3lib_div::trimExplode(), joined again with ", "
 * and wrapped in a leading and a trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://bugs.typo3.org/view.php?id=1436
 */
protected function addSpacesToKeywordList($keywordList) {
	$separated = implode(', ', t3lib_div::trimExplode(',', $keywordList));

	return ' ' . $separated . ' ';
}
2108 }
2109
2110
// XCLASS inclusion: if TYPO3_MODE is defined and a file is registered for this class file
// under $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS'], that file is included (once).
if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
}
2114 ?>