Hooks in tslib_fe, Frontend login mode feature for pages; various other things. see...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 135: class tx_indexedsearch_indexer
39 * 198: function hook_indexContent(&$pObj)
40 *
41 * SECTION: Backend API
42 * 283: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
43 * 320: function backend_setFreeIndexUid($freeIndexUid)
44 * 337: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
45 *
46 * SECTION: Initialization
47 * 388: function init()
48 * 439: function initializeExternalParsers()
49 *
50 * SECTION: Indexing; TYPO3 pages (HTML content)
51 * 480: function indexTypo3PageContent()
52 * 564: function splitHTMLContent($content)
53 * 610: function getHTMLcharset($content)
54 * 625: function convertHTMLToUtf8($content,$charset='')
55 * 653: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
56 * 680: function typoSearchTags(&$body)
57 * 709: function extractLinks($content)
58 * 752: function extractHyperLinks($string)
59 *
60 * SECTION: Indexing; external URL
61 * 804: function indexExternalUrl($externalUrl)
62 * 835: function getUrlHeaders($url, $timeout = 2)
63 *
64 * SECTION: Indexing; external files (PDF, DOC, etc)
65 * 895: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
66 * 1001: function readFileContent($ext,$absFile,$cPKey)
67 * 1018: function fileContentParts($ext,$absFile)
68 * 1036: function splitRegularContent($content)
69 *
70 * SECTION: Analysing content, Extracting words
71 * 1069: function charsetEntity2utf8(&$contentArr, $charset)
72 * 1091: function procesWordsInArrays($contentArr)
73 * 1114: function bodyDescription($contentArr)
74 * 1135: function indexAnalyze($content)
75 * 1156: function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
76 * 1175: function analyzeBody(&$retArr,$content)
77 * 1195: function metaphone($word,$retRaw=FALSE)
78 *
79 * SECTION: SQL; TYPO3 Pages
80 * 1237: function submitPage()
81 * 1306: function submit_grlist($hash,$phash_x)
82 * 1326: function submit_section($hash,$hash_t3)
83 * 1344: function removeOldIndexedPages($phash)
84 *
85 * SECTION: SQL; External media
86 * 1387: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
87 * 1449: function submitFile_grlist($hash)
88 * 1463: function submitFile_section($hash)
89 * 1477: function removeOldIndexedFiles($phash)
90 *
91 * SECTION: SQL Helper functions
92 * 1513: function checkMtimeTstamp($mtime,$phash)
93 * 1549: function checkContentHash()
94 * 1566: function checkExternalDocContentHash($hashGr,$content_md5h)
95 * 1580: function is_grlist_set($phash_x)
96 * 1593: function update_grlist($phash,$phash_x)
97 * 1608: function updateTstamp($phash,$mtime=0)
98 * 1624: function updateParsetime($phash,$parsetime)
99 * 1637: function updateRootline()
100 * 1652: function getRootLineFields(&$fieldArr)
101 * 1671: function removeLoginpagesWithContentHash()
102 *
103 * SECTION: SQL; Submitting words
104 * 1706: function checkWordList($wl)
105 * 1743: function submitWords($wl,$phash)
106 * 1767: function freqMap($freq)
107 *
108 * SECTION: Hashing
109 * 1800: function setT3Hashes()
110 * 1826: function setExtHashes($file,$subinfo=array())
111 * 1850: function md5inthash($str)
112 * 1860: function makeCHash($paramArray)
113 *
114 * SECTION: Internal logging functions
115 * 1902: function log_push($msg,$key)
116 * 1911: function log_pull()
117 * 1922: function log_setTSlogMessage($msg, $errorNum=0)
118 *
119 * TOTAL FUNCTIONS: 55
120 * (This index is automatically created/updated by the extension "extdeveval")
121 *
122 */
123
124
125 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
126
127
128 /**
129 * Indexing class for TYPO3 frontend
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @package TYPO3
133 * @subpackage tx_indexedsearch
134 */
135 class tx_indexedsearch_indexer {
136
137 // Messages:
138 var $reasons = array(
139 -1 => 'mtime matched the document, so no changes detected and no content updated',
140 -2 => 'The minimum age was not exceeded',
141 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
142 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
143 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
144 4 => 'Page has never been indexed (is not represented in the index_phash table).'
145 );
146
147 // HTML code blocks to exclude from indexing:
148 var $excludeSections = 'script,style';
149
150 // Supported Extensions for external files:
151 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods.
152
153 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
154 var $defaultGrList = '0,-1';
155
156 // Min/Max times:
157 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
158 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
159 var $maxExternalFiles = 0; // Max number of external files to index.
160
161 var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc.
162 var $crawlerActive = FALSE; // Set when crawler is detected (internal)
163
164 // INTERNALS:
165 var $defaultContentArray=array(
166 'title' => '',
167 'description' => '',
168 'keywords' => '',
169 'body' => '',
170 );
171 var $wordcount = 0;
172 var $externalFileCounter = 0;
173
174 var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning)
175 var $indexerConfig = array(); // Indexer configuration
176 var $hash = array(); // Hash array, contains phash and phash_grouping
177 var $file_phash_arr = array(); // Hash array for files
178 var $contentParts = array(); // Content of TYPO3 page
179 var $content_md5h = '';
180 var $internal_log = array(); // Internal log
181 var $indexExternalUrl_content = '';
182
183 var $cHashParams = array(); // cHashparams array
184
185 var $freqRange = 65000;
186 var $freqMax = 0.1;
187
188 // Objects:
189 var $csObj; // Charset class object , t3lib_cs
190 var $metaphoneObj; // Metaphone object, if any
191 var $lexerObj; // Lexer object for word splitting
192
193
194
195 /**
196 * Parent Object (TSFE) Initialization
197 *
198 * @param object Parent Object (frontend TSFE object), passed by reference
199 * @return void
200 */
function hook_indexContent(&$pObj) {
	// Entry point called with the frontend object ($pObj): decides whether the
	// rendered page should be indexed and, if so, copies everything the indexer
	// needs from $pObj into $this->conf before starting the indexing.

	// Indexer configuration from Extension Manager interface:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// Crawler activation:
	// Requirements are that the crawler is loaded, a crawler session is running and
	// re-indexing is requested as processing instruction. The is_array() check keeps
	// in_array() from being called on a missing/non-array instruction list.
	if (t3lib_extMgm::isLoaded('crawler')
			&& $pObj->applicationData['tx_crawler']['running']
			&& is_array($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
			&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {

		// Setting simple log message:
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

		// Setting variables:
		$this->crawlerActive = TRUE;	// Crawler active flag
		$this->forceIndexing = TRUE;	// Force indexing despite timestamps etc.
	}

	// Determine if page should be indexed, and if so, configure and initialize indexer
	if ($pObj->config['config']['index_enable']) {
		$this->log_push('Index page','');

		if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
			$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
		} elseif ($pObj->page['no_search']) {
			$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page header!');
		} elseif ($pObj->no_cache) {
			$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
		} else {

			// Setting up internal configuration from config array:
			$this->conf = array();

			// Information about page for which the indexing takes place
			$this->conf['id'] = $pObj->id;	// Page id
			$this->conf['type'] = $pObj->type;	// Page type
			$this->conf['sys_language_uid'] = $pObj->sys_language_uid;	// sys_language UID of the language of the indexing.
			$this->conf['MP'] = $pObj->MP;	// MP variable, if any (Mount Points)
			$this->conf['gr_list'] = $pObj->gr_list;	// Group list

			$this->conf['cHash'] = $pObj->cHash;	// cHash string for additional parameters
			$this->conf['cHash_array'] = $pObj->cHash_array;	// Array of the additional parameters

			$this->conf['crdate'] = $pObj->page['crdate'];	// The creation date of the TYPO3 page
			$this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;	// reg1 of the caching table. Not known what practical use this has.

			// Root line uids
			$this->conf['rootline_uids'] = array();
			foreach($pObj->config['rootLine'] as $rlkey => $rldat) {
				$this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
			}

			// Content of page:
			$this->conf['content'] = $pObj->content;	// Content string (HTML of TYPO3 page)
			$this->conf['indexedDocTitle'] = $pObj->indexedDocTitle;	// Alternative title for indexing
			$this->conf['metaCharset'] = $pObj->metaCharset;	// Character set of content (will be converted to utf-8 during indexing)
			$this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];	// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

			// Configuration of behavior:
			$this->conf['index_externals'] = $pObj->config['config']['index_externals'];	// Whether to index external documents like PDF, DOC etc. (if possible)
			$this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];	// Length of description text (max 250, default 200)

			// Set to zero:
			$this->conf['recordUid'] = 0;
			$this->conf['freeIndexUid'] = 0;

			// Init and start indexing:
			$this->init();
			$this->indexTypo3PageContent();
		}

		// Pull only what was pushed above. Previously log_pull() was also called
		// when indexing was disabled, i.e. without a matching log_push().
		$this->log_pull();
	}
}
274
275
276
277
278
279
280
281
282 /****************************
283 *
284 * Backend API
285 *
286 ****************************/
287
288 /**
289 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
290 *
291 * @param integer The page uid, &id=
292 * @param integer The page type, &type=
293 * @param integer sys_language uid, typically &L=
294 * @param string The MP variable (Mount Points), &MP=
295 * @param array Rootline array of only UIDs.
296 * @param array Array of GET variables to register with this indexing
297 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
298 * @return void
299 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {
	// Backend counterpart of the frontend hook: fills $this->conf from the
	// arguments (instead of from a TSFE object) and then runs init().

	// Start from a clean configuration holding the identity of the page being indexed:
	$this->conf = array(
		'id' => $id,	// Page id (integer)
		'type' => $type,	// Page type (integer)
		'sys_language_uid' => $sys_language_uid,	// sys_language UID of the language of the indexing (integer)
		'MP' => $MP,	// MP variable, if any (Mount Points) (string)
		'gr_list' => '0,-1'	// Group list (hardcoded for now...)
	);

	// cHash string for additional parameters; only calculated when explicitly requested:
	if ($createCHash) {
		$this->conf['cHash'] = $this->makeCHash($cHash_array);
	} else {
		$this->conf['cHash'] = '';
	}
	$this->conf['cHash_array'] = $cHash_array;	// Array of the additional parameters

	// Defaults:
	$this->conf['freeIndexUid'] = 0;
	$this->conf['page_cache_reg1'] = '';

	// Root line uids, as handed in by the caller:
	$this->conf['rootline_uids'] = $uidRL;

	// Behaviour settings are fixed when indexing from the backend:
	$this->conf['index_externals'] = 1;	// Whether to index external documents like PDF, DOC etc. (if possible)
	$this->conf['index_descrLgd'] = 200;	// Length of description text (max 250, default 200)

	// Initialize the indexer with the configuration built above:
	$this->init();
}
330
331 /**
332 * Sets the free-index uid. Can be called right after backend_initIndexer()
333 *
334 * @param integer Free index UID
335 * @return void
336 */
function backend_setFreeIndexUid($freeIndexUid) {
	// Replaces the value 0 that backend_initIndexer() stores under this key.
	$this->conf['freeIndexUid'] = $freeIndexUid;
}
340
341 /**
342 * Indexing records as the content of a TYPO3 page.
343 *
344 * @param string Title equivalent
345 * @param string Keywords equivalent
346 * @param string Description equivalent
347 * @param string The main content to index
348 * @param string The charset of the title, keyword, description and body-content
349 * @param integer Last modification time, in seconds
350 * @param integer The creation date of the content, in seconds
351 * @param integer The record UID that the content comes from (for registration with the indexed rows)
352 * @return void
353 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {
	// Wraps arbitrary record content in a minimal HTML document so that it can
	// go through the same pipeline as a rendered TYPO3 page.

	// All parts are escaped so that the record text cannot be mistaken for markup:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);

	// Meta data of the content:
	$this->conf['mtime'] = $mtime;	// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;	// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

	// Fake HTML document for the parser (content string, "HTML of TYPO3 page"):
	$this->conf['content'] = '
<html>
<head>
<title>'.$escapedTitle.'</title>
<meta name="keywords" content="'.$escapedKeywords.'" />
<meta name="description" content="'.$escapedDescription.'" />
</head>
<body>
'.$escapedBody.'
</body>
</html>';

	// Charset of the parts above (will be converted to utf-8 during indexing); no alternative title:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

	// Hand over to the regular page indexing:
	$this->indexTypo3PageContent();
}
381
382
383
384
385
386
387
388
389
390
391
392
393
394 /********************************
395 *
396 * Initialization
397 *
398 *******************************/
399
400 /**
401 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
402 *
403 * @return void
404 */
function init() {
	global $TYPO3_CONF_VARS;

	// Prepare the cHash parameter array from the configuration:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		// Add the cHash itself so that URL's come out right...
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		// encryptionKey is added inside TSFE in order to calculate the cHash value and it
		// should NOT be a part of this array!!! If it is it will be exposed in links!!!
		unset($this->cHashParams['encryptionKey']);
	}

	// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
	$this->setT3Hashes();

	// Indexer configuration from Extension Manager interface.
	// minAge/maxAge are multiplied by 3600 before being stored as the tstamp limits.
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

	// Initialize external document parsers (only when external documents are to be indexed).
	// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

	// Initialize lexer (class that deconstructs the text into words):
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer']) {
		$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'];
	}
	$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

	// Initialize metaphone hook, if one is configured:
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
	}

	// Init charset class:
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
448
449 /**
450 * Initialize external parsers
451 *
452 * @return void
453 * @access private
454 * @see init()
455 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	// Registers one parser object per configured file extension in
	// $this->external_parsers. Parsers that cannot be created or whose
	// initParser() returns false are left out.
	if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
			$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

			// A misconfigured object reference must not break page rendering:
			// without an object there is nothing to call initParser() on, so skip it.
			// NOTE(review): assumes getUserObj() yields a non-object on failure - confirm.
			if (!is_object($this->external_parsers[$extension])) {
				unset($this->external_parsers[$extension]);
				continue;
			}

			// Give the parser access to this indexer:
			$this->external_parsers[$extension]->pObj = &$this;

			// Init parser and if it returns false, unset its entry again:
			if (!$this->external_parsers[$extension]->initParser($extension)) {
				unset($this->external_parsers[$extension]);
			}
		}
	}
}
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486 /********************************
487 *
488 * Indexing; TYPO3 pages (HTML content)
489 *
490 *******************************/
491
492 /**
493 * Start indexing of the TYPO3 page
494 *
495 * @return void
496 */
function indexTypo3PageContent() {
	// Indexes the page described by $this->conf: decides from mtime/timestamps,
	// gr_list state and the content hash whether (re-)indexing is needed, and if
	// so runs the split / convert / analyse / submit steps.

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	if ($check > 0 || !$is_grlist || $this->forceIndexing) {

		// Setting message:
		if ($this->forceIndexing) {
			$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
		} elseif ($check > 0) {
			$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
		} else {
			$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
		}

		// Divide into title,keywords,description and body:
		$this->log_push('Split content','');
		$this->contentParts = $this->splitHTMLContent($this->conf['content']);
		if ($this->conf['indexedDocTitle']) {
			$this->contentParts['title'] = $this->conf['indexedDocTitle'];
		}
		$this->log_pull();

		// Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
		// The separator is passed first: the legacy (array, separator) order is rejected by PHP 8.
		$this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

		// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
		// If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
		// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
		$checkCHash = $this->checkContentHash();
		if (!is_array($checkCHash) || $check===1) {
			$Pstart=t3lib_div::milliseconds();

			$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
			$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
			$this->log_pull();

			// Splitting words
			$this->log_push('Extract words from content','');
			$splitInWords = $this->procesWordsInArrays($this->contentParts);
			$this->log_pull();

			// Analyse the indexed words.
			$this->log_push('Analyse the extracted words','');
			$indexArr = $this->indexAnalyze($splitInWords);
			$this->log_pull();

			// Submitting page (phash) record
			$this->log_push('Submitting page','');
			$this->submitPage();
			$this->log_pull();

			// Check words and submit to word list if not there
			$this->log_push('Check word list and submit words','');
			$this->checkWordList($indexArr);
			$this->submitWords($indexArr,$this->hash['phash']);
			$this->log_pull();

			// Set parsetime
			$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

			// Checking external files if configured for.
			$this->log_push('Checking external files','');
			if ($this->conf['index_externals']) {
				$this->extractLinks($this->conf['content']);
			}
			$this->log_pull();
		} else {
			$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
			$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
			$this->updateRootline();
			$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		}
	} else {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
	}
}
575
576 /**
577 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
578 *
579 * @param string HTML content to index. To some degree expected to be made by TYPO3 (i.e. the title is split at the first ":")
580 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
581 * @see splitRegularContent()
582 */
function splitHTMLContent($content) {
	// Splits an HTML document into the parts of $this->defaultContentArray:
	// title and keywords/description meta tags from the head, tag-free text
	// from the body.

	// Divide head from body. Everything before the first "<body" is the head.
	// If there is no body tag at all, the whole document is searched as head;
	// the former "substr($content,0,-0)" returned an empty head in that case,
	// so title and meta tags were lost.
	$contentArr = $this->defaultContentArray;
	$bodyPart = stristr($content,'<body');
	if ($bodyPart === FALSE) {
		$contentArr['body'] = '';
		$headPart = $content;
	} else {
		$contentArr['body'] = $bodyPart;
		$headPart = substr($content,0,strlen($content)-strlen($bodyPart));
	}

	// Get title. Only the part after the first ":" is kept, if there is one.
	$this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
	$titleParts = explode(':',$contentArr['title'],2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

	// Collect the attribute strings of all meta tags. Each call consumes the
	// head up to and including the found tag, so the loop ends when none is left.
	$meta = array();
	for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }

	// Get keywords and description metatags. Tags without a name attribute
	// (e.g. http-equiv ones) are skipped without touching undefined indexes.
	for($i=0;isset($meta[$i]);$i++) {
		$attributes = t3lib_div::get_tag_attributes($meta[$i]);
		$metaName = isset($attributes['name']) ? $attributes['name'] : '';
		$metaValue = isset($attributes['content']) ? $attributes['content'] : '';
		if (stristr($metaName,'keywords')) $contentArr['keywords'].=','.$metaValue;
		if (stristr($metaName,'description')) $contentArr['description'].=','.$metaValue;
	}

	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

	// Get rid of unwanted sections (ie. scripting and style stuff) in body
	$tagList = explode(',',$this->excludeSections);
	foreach($tagList as $tag) {
		while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
	}

	// Remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<',' <',$contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

	// Return array
	return $contentArr;
}
622
623 /**
624 * Extract the charset value from HTML meta tag.
625 *
626 * @param string HTML content
627 * @return string The charset value if found.
628 */
function getHTMLcharset($content) {
	// Looks for <meta http-equiv="Content-Type" ...> and returns the value of
	// its "charset=" part. Returns nothing (NULL) if no such tag or no charset
	// is found.
	//
	// preg_match() with the "i" modifier replaces eregi(), which no longer
	// exists in PHP 7+. The charset pattern also accepts "_" so that names
	// like "Shift_JIS" are returned completely.
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
		if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]_-]+)/i',$reg[0],$reg2)) {
			return $reg2[1];
		}
	}
}
636
637 /**
638 * Converts a HTML document to utf-8
639 *
640 * @param string HTML content, any charset
641 * @param string Optional charset (otherwise extracted from HTML)
642 * @return string Converted HTML
643 */
function convertHTMLToUtf8($content,$charset='') {

	// Without an explicit charset, take the one announced in the document itself:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

	// Recode only when a charset is known and it is not utf-8 already:
	$needsRecoding = ($charset && $charset!=='utf-8');
	if ($needsRecoding) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

	// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
659
660 /**
661 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
662 * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
663 * <title> of document or removing <script>-sections
664 *
665 * @param string String to search in
666 * @param string Tag name, eg. "script"
667 * @param string Passed by reference: Content inside found tag
668 * @param string Passed by reference: Content after found tag
669 * @param string Passed by reference: Attributes of the found tag.
670 * @return boolean Returns false if tag was not found, otherwise true.
671 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	// Finds the first <$tagName ...> in $string (case-insensitive).
	//  - $paramList receives everything between the tag name and the closing ">" of the start tag.
	//  - If a matching end tag follows, $tagContent receives the text between the two tags and
	//    $stringAfter the original string with the whole element cut out.
	//  - If there is no end tag, $tagContent is blank and $stringAfter is only what follows the start tag.
	// Returns false (leaving the reference arguments untouched) if the start tag is not found.
	$endTag = '</'.$tagName.'>';
	$startTag = '<'.$tagName;

	$fromStartTag = stristr($string,$startTag);	// stristr used because we want a case-insensitive search for the tag.
	if (!$fromStartTag) return false;	// if the tag was not found, return false

	// Split "attributes>rest". A start tag that is never closed with ">" gives only one part;
	// the rest is then an empty string instead of an undefined offset.
	$tagParts = explode('>',substr($fromStartTag,strlen($startTag)),2);
	$paramList = $tagParts[0];
	$afterStartTag = isset($tagParts[1]) ? $tagParts[1] : '';

	$fromEndTag = stristr($afterStartTag,$endTag);
	if ($fromEndTag) {
		$stringBefore = substr($string, 0, strpos(strtolower($string), strtolower($startTag)));
		$tagContent = substr($afterStartTag,0,strlen($afterStartTag)-strlen($fromEndTag));
		$stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
	} else {	// If there was no ending tag, the tagContent is blank and anything after the tag itself is returned.
		$tagContent = '';
		$stringAfter = $afterStartTag;
	}

	return true;
}
692
693 /**
694 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
695 *
696 * @param string HTML Content, passed by reference
697 * @return boolean Returns true if a TYPO3SEARCH_ tag was found, otherwise false.
698 */
function typoSearchTags(&$body) {
	// Cut the content at every "<!--TYPO3SEARCH_" marker; a single piece means no marker at all.
	$segments = explode('<!--TYPO3SEARCH_',$body);
	if (count($segments) <= 1) {
		return false;
	}

	// Rebuild the body from the pieces that are to be indexed:
	$body = '';
	foreach($segments as $segment) {
		$pieces = explode('-->',$segment,2);
		switch(trim($pieces[0])) {
			case 'begin':
				// Content following a "begin" marker is kept.
				$body.= $pieces[1];
				$pending = '';
			break;
			case 'end':
				// An "end" marker keeps the remembered piece (blank if a "begin" marker came right before it).
				$body.= $pending;
			break;
			default:
				// Anything else (typically the content in front of the first marker) is only remembered.
				$pending = $segment;
			break;
		}
	}

	return true;
}
721
722 /**
723 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
724 *
725 * @param string HTML content
726 * @return void
727 */
function extractLinks($content) {
	// Walks through all <a href> links of $content. Links with a scheme are
	// handed to indexExternalUrl() (if enabled in the indexer configuration),
	// links without scheme and query string are indexed as local files if
	// such a file exists.

	// Get links:
	$list = $this->extractHyperLinks($content);

	// Traverse links:
	foreach($list as $linkInfo) {

		// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

		// Parse URL. parse_url() returns FALSE for seriously malformed URLs;
		// treat that like "no components found".
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}

		// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);

			// "jumpurl=" may only have matched as part of another parameter name
			// (or be an array parameter); the link is then processed unchanged.
			if (isset($getP['jumpurl']) && is_string($getP['jumpurl'])) {
				$linkSource = $getP['jumpurl'];
				$qParts = parse_url($linkSource);	// parse again due to new linkSource!
				if (!is_array($qParts)) {
					$qParts = array();
				}
			}
		}

		if (!empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
		} elseif (empty($qParts['query'])) {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
			if ($localFile && @is_file($localFile)) {
				// Index local file:
				$this->indexRegularDocument($linkSource);
			}
		}
	}
}
763
764 /**
765 * Extracts all links to external documents from content string.
766 *
767 * @param string Content to analyse
768 * @return array Array of hyperlinks
769 * @see extractLinks()
770 */
function extractHyperLinks($string) {
	// The HTML parser is created on first use and kept for later calls:
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$links = array();
	$segments = $this->htmlParser->splitTags('a',$string);
	foreach($segments as $index => $segment) {

		// Only the odd-numbered segments are inspected as tags:
		if (!($index%2)) {
			continue;
		}

		$attributes = $this->htmlParser->get_tag_attributes($segment,1);
		$tagName = $this->htmlParser->getFirstTagName($segment);	// The 'name' of the first tag

		// Register <a> tags that carry a non-empty href:
		if (strtolower($tagName) == 'a' && $attributes[0]['href']) {
			$links[] = array(
				'tag' => $segment,
				'href' => $attributes[0]['href']
			);
		}
	}

	return $links;
}
799
800
801
802
803
804
805
806
807
808
809
810 /******************************************
811 *
812 * Indexing; external URL
813 *
814 ******************************************/
815
/**
 * Index External URLs HTML content
 * The URL is only fetched and indexed when its response headers announce "text/html".
 *
 * @param	string		URL, eg. "http://typo3.org/"
 * @return	void
 * @see indexRegularDocument()
 */
function indexExternalUrl($externalUrl)	{

	// Get headers; getUrlHeaders() returns FALSE when no headers could be retrieved:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders) || !isset($urlHeaders['Content-Type']) || !stristr($urlHeaders['Content-Type'],'text/html'))	{
		return;
	}

	// Fetch the document itself (also kept on the object for later use):
	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content))	{
		return;
	}

	// Create temporary file. The content is written to a sibling with ".html" appended to the generated name.
	$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
	$tmpFile = $tmpBase.'.html';
	t3lib_div::writeFile($tmpFile, $content);

	// Index that file:
	$this->indexRegularDocument($externalUrl, FALSE, $tmpFile, 'html');

	// Clean up the ".html" file AND the file with the generated base name.
	// NOTE(review): tempnam() presumably creates the base file on disk; it was previously left behind - confirm against t3lib_div::tempnam().
	unlink($tmpFile);
	if ($tmpBase && @is_file($tmpBase))	{
		unlink($tmpBase);
	}
}
846
/**
 * Getting HTTP response headers of URL
 * Sends a HTTP/1.0 GET request and reads the response up to the first blank line.
 *
 * @param	string		The URL. Only "http" URLs (or URLs without scheme) having a host are requested.
 * @param	integer		Connection timeout passed to fsockopen()
 * @return	mixed		FALSE if the URL cannot be requested or the connection fails. Otherwise an array where the part of each header line before the first colon is the key and the trimmed remainder is the value (NULL for lines without a colon, such as the status line).
 */
function getUrlHeaders($url, $timeout = 2)	{
	$urlParts = parse_url($url);

	// parse_url() yields FALSE for seriously malformed URLs:
	if (!is_array($urlParts))	{
		return FALSE;
	}

	// Scheme names are case-insensitive; only plain HTTP is supported:
	$scheme = isset($urlParts['scheme']) ? strtolower($urlParts['scheme']) : '';
	if (!in_array($scheme, array('','http')))	{
		return FALSE;
	}

	// Without a host there is nothing to connect to:
	if (!isset($urlParts['host']) || !strlen($urlParts['host']))	{
		return FALSE;
	}

	$port = (isset($urlParts['port']) && $urlParts['port'] > 0) ? $urlParts['port'] : 80;
	// An empty path (eg. "http://typo3.org") must be requested as "/", otherwise the request line is invalid:
	$path = (isset($urlParts['path']) && strlen($urlParts['path'])) ? $urlParts['path'] : '/';
	$query = (isset($urlParts['query']) && strlen($urlParts['query'])) ? '?'.$urlParts['query'] : '';

	$fp = fsockopen($urlParts['host'], $port, $errno, $errstr, $timeout);
	if (!$fp)	{
		return FALSE;
	}

	$msg = 'GET '.$path.$query." HTTP/1.0\r\nHost: ".$urlParts['host']."\r\n\r\n";
	fputs($fp, $msg);

	// Read until the blank line that terminates the header section (or until reading fails):
	$rawHeaders = '';
	while (!feof($fp))	{
		$line = fgets($fp,2048);
		$rawHeaders.= $line;
		if (!strlen(trim($line)))	{
			break;
		}
	}
	fclose($fp);

	// Compile headers:
	$retVal = array();
	foreach(explode(chr(10), $rawHeaders) as $line)	{
		$line = trim($line);
		if (!strlen($line))	{
			continue;
		}
		$parts = explode(':', $line, 2);
		$retVal[$parts[0]] = isset($parts[1]) ? trim($parts[1]) : NULL;
	}

	return $retVal;
}
886
887
888
889
890
891
892
893
894
895
896
897
898
899 /******************************************
900 *
901 * Indexing; external files (PDF, DOC, etc)
902 *
903 ******************************************/
904
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 * For every division reported by fileContentParts() the document is (re-)indexed when checkMtimeTstamp() demands it or $force is set; a section record is registered in either case.
 *
 * @param	string		Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param	boolean		If set, indexing is forced (despite content hashes, mtime etc).
 * @param	string		Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param	string		File extension for temporary file.
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')	{

	// Init: the extension selects the external parser. Files without extension yield an empty string.
	$fI = pathinfo($file);
	if ($altExtension)	{
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

	// Create abs-path:
	if (!$contentTmpFile)	{
		if (!t3lib_div::isAbsPath($file))	{	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

	// Indexing the document:
	if ($absFile && @is_file($absFile))	{
		if (!empty($this->external_parsers[$ext]))	{
			$mtime = filemtime($absFile);
			$cParts = $this->fileContentParts($ext,$absFile);

			foreach($cParts as $cPKey)	{
				$this->internal_log = array();
				$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
				$Pstart = t3lib_div::milliseconds();
				$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
				$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
				$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

				if ($check > 0 || $force)	{
					if ($check > 0)	{
						$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
					} else {
						$this->log_setTSlogMessage('Indexing forced by flag',1);
					}

					// Check external file counter:
					if ($this->externalFileCounter < $this->maxExternalFiles || $force)	{

						// Divide into title,keywords,description and body:
						$this->log_push('Split content','');
						$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
						$this->log_pull();

						if (is_array($contentParts))	{
							// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
							// implode() is called with the separator first; the reversed legacy order is not accepted by PHP 8.
							$content_md5h = $this->md5inthash(implode('',$contentParts));

							if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force)	{

								// Increment counter:
								$this->externalFileCounter++;

								// Splitting words
								$this->log_push('Extract words from content','');
								$splitInWords = $this->procesWordsInArrays($contentParts);
								$this->log_pull();

								// Analyse the indexed words.
								$this->log_push('Analyse the extracted words','');
								$indexArr = $this->indexAnalyze($splitInWords);
								$this->log_pull();

								// Submitting page (phash) record
								$this->log_push('Submitting page','');
								$size = filesize($absFile);
								$ctime = filemtime($absFile);	// The original creation time cannot be determined, so the modification time is used instead.
								$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
								$this->log_pull();

								// Check words and submit to word list if not there
								$this->log_push('Check word list and submit words','');
								$this->checkWordList($indexArr);
								$this->submitWords($indexArr,$phash_arr['phash']);
								$this->log_pull();

								// Set parsetime
								$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
							} else {
								$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
								$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
							}
						} else {
							$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
						}
					} else {
						$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
					}
				} else {
					$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
				}

				// Checking and setting sections:
				# $this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
				$this->submitFile_section($phash_arr['phash']);	// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
				$this->log_pull();
			}
		} else {
			$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		}
	} else {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
	}
}
1010
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return	mixed		Whatever the external parser returns (expected: standard content array with title, description, keywords, body keys). NULL if no parser object is registered for the extension.
 */
function readFileContent($ext,$absFile,$cPKey)	{
	// Defined up front so that a missing parser yields an explicit NULL instead of an undefined variable:
	$contentArr = NULL;

	// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext]))	{
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
1029
/**
 * Asks the external parser into which divisions a document should be split.
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Pointers to the sections of the document; array(0) when no parser object is registered for the extension
 */
function fileContentParts($ext,$absFile)	{
	// Without a parser object the document counts as one undivided part:
	if (!is_object($this->external_parsers[$ext]))	{
		return array(0);
	}

	// Otherwise the parser decides:
	return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
}
1047
/**
 * Wraps non-HTML content (from external files for instance) in a content array.
 *
 * @param	string		Input content (non-HTML) to index.
 * @return	array		Copy of $this->defaultContentArray with the key "body" set to the input content
 * @see splitHTMLContent()
 */
function splitRegularContent($content)	{
	// Start from the default content array and fill in the body only:
	$result = $this->defaultContentArray;
	$result['body'] = $content;

	return $result;
}
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075 /**********************************
1076 *
1077 * Analysing content, Extracting words
1078 *
1079 **********************************/
1080
/**
 * Convert character set and HTML entities in the value of input content array keys
 * Empty values are left untouched.
 *
 * @param	array		Standard content array, passed by reference and modified in place
 * @param	string		Charset of the input content (converted to utf-8)
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset)	{
	if (!is_array($contentArr))	{
		return;
	}

	// Iterating over the keys (instead of the removed each() construct) allows the values to be replaced in place:
	foreach(array_keys($contentArr) as $key)	{
		if (strlen($contentArr[$key]))	{

			// Convert charset if necessary
			if ($charset!=='utf-8')	{
				$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
			}

			// Decode all numeric / html-entities in the string to real characters:
			$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
		}
	}
}
1104
/**
 * Processing words in the array from split*Content -functions
 *
 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return	array		Content input array modified so each key now holds the result of the lexer's split2Words(); duplicates are removed from "title", "keywords" and "description"
 */
function procesWordsInArrays($contentArr)	{

	// Split all parts to words (foreach replaces the each() construct, which PHP 8 no longer provides):
	foreach(array_keys($contentArr) as $key)	{
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

	// For title, keywords, and description we don't want duplicates.
	// Keys that are missing or do not hold an array are skipped rather than handed to array_unique():
	foreach(array('title','keywords','description') as $key)	{
		if (isset($contentArr[$key]) && is_array($contentArr[$key]))	{
			$contentArr[$key] = array_unique($contentArr[$key]);
		}
	}

	// Return modified array:
	return $contentArr;
}
1127
/**
 * Extracts the sample description text from the content array.
 *
 * @param	array		Content array; the "body" key is used
 * @return	string		Description string, truncated to the length configured in $this->conf['index_descrLgd'] (range 0-255, default 200). Empty string when that length is zero.
 */
function bodyDescription($contentArr)	{
	// Defined up front so a configured length of zero returns an empty string instead of an undefined variable:
	$bodyDescription = '';

	// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL)	{
		// Turn tabs and line breaks into plain spaces:
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1149
/**
 * Builds the index array for a set of already split content parts.
 *
 * @param	array		Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return	array		Index array keyed by word, filled by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content)	{
	$wordIndex = array();

	// Header sections with the bit offset each of them is registered under:
	$headerSections = array(
		'title' => 7,
		'keywords' => 6,
		'description' => 5
	);
	foreach($headerSections as $section => $bitOffset)	{
		$this->analyzeHeaderinfo($wordIndex,$content,$section,$bitOffset);
	}

	// Finally the body words:
	$this->analyzeBody($wordIndex,$content);

	return $wordIndex;
}
1167
/**
 * Calculates relevant information for headercontent
 * For every word the bit 2^$offset is set in "cmp", "count" is incremented and "hash" / "metaphone" are (re)calculated.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @param	string		Key from standard content array
 * @param	integer		Bit-wise priority to type
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset)	{
	// Nothing to do when the section is missing or holds no word list:
	if (!isset($content[$key]) || !is_array($content[$key]))	{
		return;
	}

	foreach($content[$key] as $val)	{
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		// Initialize the accumulators on the first occurrence of a word instead of reading undefined entries:
		if (!isset($retArr[$val]['cmp']))	{
			$retArr[$val]['cmp'] = 0;
		}
		if (!isset($retArr[$val]['count']))	{
			$retArr[$val]['count'] = 0;
		}

		$retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset);
		$retArr[$val]['count'] = $retArr[$val]['count']+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
1188
/**
 * Calculates relevant information for bodycontent
 * Words not yet present in the index array get "first" (key of the first occurrence in the body list), "hash" and "metaphone"; "count" is incremented for every occurrence.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @return	void
 */
function analyzeBody(&$retArr,$content)	{
	// Nothing to do when there is no body word list:
	if (!isset($content['body']) || !is_array($content['body']))	{
		return;
	}

	foreach($content['body'] as $key => $val)	{
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		if (!isset($retArr[$val]))	{
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

		// Start counting at zero instead of reading an undefined entry:
		if (!isset($retArr[$val]['count']))	{
			$retArr[$val]['count'] = 0;
		}
		$retArr[$val]['count'] = $retArr[$val]['count']+1;
		$this->wordcount++;
	}
}
1208
/**
 * Returns the metaphone value of a word, either raw or reduced to an integer hash.
 * The object in $this->metaphoneObj is used when set, PHP's native metaphone() otherwise.
 *
 * @param	string		Word to convert
 * @param	boolean		If set, returns the raw metaphone value (not hashed)
 * @return	mixed		Metaphone hash integer (zero for an empty metaphone value), or the raw value
 */
function metaphone($word,$retRaw=FALSE)	{
	$rawValue = is_object($this->metaphoneObj) ? $this->metaphoneObj->metaphone($word) : metaphone($word);

	// Raw value requested?
	if ($retRaw)	{
		return $rawValue;
	}

	// Otherwise hash it the same way the words themselves are hashed:
	return $rawValue=='' ? 0 : hexdec(substr(md5($rawValue),0,7));
}
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247 /********************************
1248 *
1249 * SQL; TYPO3 Pages
1250 *
1251 *******************************/
1252
/**
 * Writes the index records of the current TYPO3 page (not external media) to the database:
 * index_phash, index_section, index_grlist, index_fulltext and - in debug mode - index_debug.
 *
 * @return	void
 */
function submitPage()	{
	$phash = $this->hash['phash'];

	// Flush whatever is registered for this phash already:
	$this->removeOldIndexedPages($phash);

	// index_phash record:
	$phashRow = array(
		'phash' => $phash,
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

	// index_section and index_grlist records; for a TYPO3 page both hash arguments are the page's own phash:
	$this->submit_section($phash,$phash);
	$this->submit_grlist($phash,$phash);

	// index_fulltext record:
	$fulltextRow = array(
		'phash' => $phash,
		'fulltextdata' => implode(' ', $this->contentParts)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

	// index_debug record, only in debug mode. Page content and body are cut to 1000 bytes:
	if ($this->indexerConfig['debugMode'])	{
		$debugInfo = array(
			'cHashParams' => $this->cHashParams,
			'external_parsers initialized' => array_keys($this->external_parsers),
			'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
			'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
			'logs' => $this->internal_log,
			'lexer' => $this->lexerObj->debugString,
		);
		$debugRow = array(
			'phash' => $phash,
			'debuginfo' => serialize($debugInfo)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1318
/**
 * Inserts an index_grlist record for the gr_list of the current configuration.
 *
 * @param	integer		Search result record phash
 * @param	integer		Actual phash of current content
 * @return	void
 * @see update_grlist()
 */
function submit_grlist($hash,$phash_x)	{
	$grList = $this->conf['gr_list'];

	// The gr_list is stored both as integer hash and verbatim:
	$grlistRow = array();
	$grlistRow['phash'] = $hash;
	$grlistRow['phash_x'] = $phash_x;
	$grlistRow['hash_gr_list'] = $this->md5inthash($grList);
	$grlistRow['gr_list'] = $grList;

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grlistRow);
}
1338
/**
 * Inserts an index_section record for the current page id, including the root line fields.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param	integer		phash of TYPO3 parent search result record
 * @param	integer		phash of the file indexation search record
 * @return	void
 */
function submit_section($hash,$hash_t3)	{
	$sectionRow = array();
	$sectionRow['phash'] = $hash;
	$sectionRow['phash_t3'] = $hash_t3;
	$sectionRow['page_id'] = intval($this->conf['id']);

	// Adds rl0, rl1, rl2 (and configured additional fields):
	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1358
/**
 * Deletes all index records registered for a page phash.
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedPages($phash)	{
	$phashInt = intval($phash);

	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash','index_section','index_grlist','index_fulltext','index_debug');
	foreach($tables as $tableName)	{
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.$phashInt);
	}

	// Also remove every index_section record that points to this page through phash_t3 (this includes records set for external media on the page). These are re-inserted by indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phashInt);
}
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387 /********************************
1388 *
1389 * SQL; External media
1390 *
1391 *******************************/
1392
1393
/**
 * Writes the index records of an external file to the database:
 * index_phash, index_fulltext and - in debug mode - index_debug.
 *
 * @param	array		Array with phash and phash_grouping keys for file
 * @param	string		File name
 * @param	array		Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param	string		File extension determining the type of media.
 * @param	integer		Modification time of file.
 * @param	integer		Creation time of file.
 * @param	integer		Size of file in bytes
 * @param	integer		Content HASH value.
 * @param	array		Standard content array (using only title and body for a file)
 * @return	void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)	{

	// Item type: the parser's mapping for the extension, or the extension itself when nothing is mapped:
	$mappedType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	$storeItemType = $mappedType ? $mappedType : $ext;

	// Flush whatever is registered for this phash already:
	$this->removeOldIndexedFiles($hash['phash']);

	// A "file" that parses with a scheme is flagged as external URL below:
	$fileParts = parse_url($file);

	// index_phash record; the file's basename stands in for an empty title:
	$phashRow = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => time(),
		'crdate' => time(),
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $fileParts['scheme'] ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

	// index_fulltext record:
	$fulltextRow = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

	// index_debug record, only in debug mode. The body is cut to 1000 bytes:
	if ($this->indexerConfig['debugMode'])	{
		$debugInfo = array(
			'cHashParams' => $subinfo,
			'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
			'logs' => $this->internal_log,
			'lexer' => $this->lexerObj->debugString,
		);
		$debugRow = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize($debugInfo)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1463
/**
 * Inserts a gr_list record for a file unless one exists already for either the default gr_list or the current gr_list.
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_grlist($hash)	{
	// A record matching the default (non-logged in) gr_list or the current gr_list makes another one unnecessary:
	$where = 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')';
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))	{
		return;
	}

	$this->submit_grlist($hash,$hash);
}
1477
/**
 * Inserts a section record for a file on the current page unless one exists already.
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_section($hash)	{
	// Is there a section record for this file on the current page?
	$where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))	{
		return;
	}

	// None found; relate the file to the phash of the current page:
	$this->submit_section($hash,$this->hash['phash']);
}
1491
/**
 * Deletes the index records registered for a file phash (index_section is not touched).
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedFiles($phash)	{
	$where = 'phash='.intval($phash);

	// Removing old registrations for tables.
	$tables = array('index_phash','index_grlist','index_fulltext','index_debug');
	foreach($tables as $tableName)	{
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
	}
}
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520 /********************************
1521 *
1522 * SQL Helper functions
1523 *
1524 *******************************/
1525
/**
 * Decides from mtime / tstamp of the indexed record (found by phash) whether a page/file must be indexed.
 * A positive return value means the page needs to be indexed!
 *
 * Result codes (see also $this->reasons):
 *  -2: Min age was NOT exceeded, so indexing cannot occur.
 *  -1: Mtimes matched, no need to reindex. The record's tstamp is updated unless a max age is configured.
 *   1: Max age exceeded, page must be indexed again.
 *   2: mtime of the indexed record differs from the mtime given, page must be indexed.
 *   3: No mtime was given, so the page is indexed.
 *   4: No indexed record found, so the page is indexed.
 *
 * @param	integer		mtime value to test against limits and indexed page.
 * @param	integer		"phash" used to select any already indexed page to see what its mtime is.
 * @return	integer		Result code: <0 = no indexing, >0 = do indexing
 */
function checkMtimeTstamp($mtime,$phash)	{

	// Select indexed page:
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	// Never indexed before (not represented in the index_phash table):
	if (!$row)	{
		return 4;
	}

	// The configured max-age was exceeded for the document:
	if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time())	{
		return 1;
	}

	// A minimum age is set and has not been exceeded yet:
	if ($this->tstamp_minAge && ($row['tstamp']+$this->tstamp_minAge) >= time())	{
		return -2;
	}

	// Without an mtime there is nothing to compare, so the page is indexed:
	if (!$mtime)	{
		return 3;
	}

	// mtime differs from what the index holds, so it's about time to re-index:
	if ($row['item_mtime'] != $mtime)	{
		return 2;
	}

	// mtime matched the document, so no changes detected and no content updated:
	if ($this->tstamp_maxAge)	{
		$this->log_setTSlogMessage('Mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
	} else {
		$this->updateTstamp($phash);	// Update the timestamp
		$this->log_setTSlogMessage('Mtime matched, timestamp updated.',1);
	}
	return -1;
}
1564
/**
 * Looks for an index_phash record with the phash_grouping and content hash of the current page.
 *
 * @return	mixed		Integer 1 if the page needs to be indexed (no such record exists), otherwise the found row (array with the key "phash") to which the grlist record should be related
 */
function checkContentHash()	{
	// The page is only to be indexed if its content differs from the page with the same "phash_grouping":
	$where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
	if ($row)	{
		return $row;
	}

	return 1;
}
1578
/**
 * Looks for an index_phash record with the given phash_grouping and content hash (external documents).
 *
 * @param	integer		phash value to check (phash_grouping)
 * @param	integer		Content hash to check
 * @return	integer		1 if the document needs to be indexed (no such record exists), otherwise 0
 */
function checkExternalDocContentHash($hashGr,$content_md5h)	{
	$where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);

	// Any hit means the same content is indexed already:
	return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
}
1594
/**
 * Counts the grlist records set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
 *
 * @param	integer		Phash integer to test.
 * @return	integer		Number of index_grlist rows having this phash_x; zero means that none has been set
 */
function is_grlist_set($phash_x)	{
	$where = 'phash_x='.intval($phash_x);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);
	$rowCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

	return $rowCount;
}
1605
/**
 * Writes a grlist entry for the phash and the current gr_list, unless such an entry exists already.
 *
 * @param	integer		phash of the search result that should be found
 * @param	integer		The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return	void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x)	{
	$where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

	// Entry is there already, nothing to write:
	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))	{
		return;
	}

	$this->submit_grlist($phash,$phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
}
1621
/**
 * Sets tstamp of a phash row to the current time; optionally sets item_mtime as well.
 *
 * @param	integer		phash value
 * @param	integer		If set, update the mtime field to this value.
 * @return	void
 */
function updateTstamp($phash,$mtime=0)	{
	$changes = array();
	$changes['tstamp'] = time();

	// item_mtime is only touched when a value is passed:
	if ($mtime)	{
		$changes['item_mtime'] = intval($mtime);
	}

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $changes);
}
1637
/**
 * Stores the parsetime in a phash row.
 *
 * @param	integer		phash value.
 * @param	integer		Parsetime value to set.
 * @return	void
 */
function updateParsetime($phash,$parsetime)	{
	$changes = array();
	$changes['parsetime'] = intval($parsetime);

	$where = 'phash='.intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
1652
/**
 * Refreshes the root line fields of all index_section records of the current page id.
 *
 * @return	void
 */
function updateRootline()	{
	// Collect rl0, rl1, rl2 (and configured additional fields):
	$rootLineValues = array();
	$this->getRootLineFields($rootLineValues);

	$where = 'page_id='.intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineValues);
}
1665
/**
 * Adds the root-line fields to a field array.
 * rl0, rl1 and rl2 are standard. More fields can be configured in $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'] as "field name => root line level" pairs.
 *
 * @param	array		Field array, passed by reference
 * @return	void
 */
function getRootLineFields(&$fieldArr)	{

	// Standard fields: the page uids of root line levels 0, 1 and 2:
	for ($level = 0; $level < 3; $level++)	{
		$fieldArr['rl'.$level] = intval($this->conf['rootline_uids'][$level]);
	}

	// Additionally configured fields:
	if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']))	{
		return;
	}
	foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel)	{
		$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}
}
1685
/**
 * Removes indexed pages that share phash_grouping and content hash with the current page but are registered under another gr_list than the default one.
 * NOT USED anywhere inside this class!
 *
 * @return	void
 */
function removeLoginpagesWithContentHash()	{
	$where = '
		A.phash=B.phash
		AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
		AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
		AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

	// Every hit is logged and flushed from the index:
	while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))	{
		$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$row['phash']."' are now removed.",1);
		$this->removeOldIndexedPages($row['phash']);
	}
}
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715 /********************************
1716 *
1717 * SQL; Submitting words
1718 *
1719 *******************************/
1720
1721 /**
1722 * Adds new words to db
1723 *
1724 * @param array Word List array (where each word has information about position etc).
1725 * @return void
1726 */
function checkWordList($wl) {
	/*
	 * Makes sure every word of $wl has a row in index_words.
	 * $wl is keyed by the base word; each entry provides 'hash' (the word id) and 'metaphone'.
	 *
	 * Iteration uses foreach instead of each(): each() is deprecated since PHP 7.2 and
	 * was removed in PHP 8.0.
	 */

	// Collect the word ids; they are cast to integers since they go straight into the IN() list.
	$wordIds = array();
	foreach ($wl as $wordInfo) {
		$wordIds[] = intval($wordInfo['hash']);
	}
	if (!count($wordIds)) {
		return;
	}

	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',',$wordIds).')');
	$knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

	// All words are stored already: nothing to insert.
	if ($knownCount == count($wl)) {
		return;
	}

	$this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownCount),1);

	// Drop every word that was found in the table, leaving only the new ones in $wl.
	while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
		unset($wl[$row['baseword']]);
	}

	foreach ($wl as $baseword => $wordInfo) {
		$insertFields = array(
			'wid' => $wordInfo['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordInfo['metaphone']
		);
			// A duplicate-key error will occur here if a word was NOT unset above. As long as the words in $wl are not longer than 60 chars (the baseword varchar is 60 characters) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
	}
}
1756
1757 /**
1758 * Submits RELATIONS between words and phash
1759 *
1760 * @param array Word list array
1761 * @param integer phash value
1762 * @return void
1763 */
function submitWords($wl,$phash) {
	// Start from a clean slate: remove all existing relations for this phash.
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

	// Then write one index_rel row per word in the list.
	foreach ($wl as $wordInfo) {
		// Share of this word among all words, mapped through freqMap().
		$relativeFrequency = $wordInfo['count']/$this->wordcount;

		$relation = array();
		$relation['phash'] = $phash;
		$relation['wid'] = $wordInfo['hash'];
		$relation['count'] = $wordInfo['count'];
		$relation['first'] = $wordInfo['first'];
		$relation['freq'] = $this->freqMap($relativeFrequency);
		// Only the bits allowed by the flag bit mask are stored.
		$relation['flags'] = ($wordInfo['cmp'] & $this->flagBitMask);

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
1780
1781 /**
1782 * maps frequency from a real number in [0;1] to an integer in [0;$this->freqRange] with anything above $this->freqMax as 1
1783 * and back.
1784 *
1785 * @param double Frequency
1786 * @return integer Frequency in range.
1787 */
function freqMap($freq) {
	$mapFactor = $this->freqMax*100*$this->freqRange;

	// Values that are not below 1 are divided by the map factor.
	if (!($freq<1)) {
		return $freq/$mapFactor;
	}

	// Values below 1 are multiplied by the map factor and capped at $this->freqRange.
	$scaled = $freq*$mapFactor;
	if ($scaled>$this->freqRange) {
		return $this->freqRange;
	}
	return $scaled;
}
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810 /********************************
1811 *
1812 * Hashing
1813 *
1814 *******************************/
1815
1816 /**
1817 * Set search hashes for TYPO3 pages; the results are stored in $this->hash ("phash_grouping" and "phash")
1818 *
1819 * @return void
1820 */
function setT3Hashes() {
	// Identity of the page. The key order matters because the array is serialized before hashing.
	$identity = array();
	$identity['id'] = (int)$this->conf['id'];
	$identity['type'] = (int)$this->conf['type'];
	$identity['sys_lang'] = (int)$this->conf['sys_language_uid'];
	$identity['MP'] = (string)$this->conf['MP'];
	$identity['cHash'] = $this->cHashParams;

	// Grouping hash: id, type, language, mount point and cHash parameters only.
	$this->hash['phash_grouping'] = $this->md5inthash(serialize($identity));

	// Plain phash: the same identity extended with the group list.
	$identity['gr_list'] = (string)$this->conf['gr_list'];
	$this->hash['phash'] = $this->md5inthash(serialize($identity));
}
1839
1840 /**
1841 * Get search hash, external files
1842 *
1843 * @param string File name / path which identifies it on the server
1844 * @param array Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
1845 * @return array Array with "phash_grouping" and "phash" inside.
1846 */
function setExtHashes($file,$subinfo=array()) {
	// The grouping hash is derived from the file reference alone.
	$identity = array('file' => $file);
	$groupingHash = $this->md5inthash(serialize($identity));

	// The plain phash additionally covers the sub-information.
	$identity['subinfo'] = $subinfo;
	$plainHash = $this->md5inthash(serialize($identity));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $plainHash
	);
}
1863
1864 /**
1865 * md5 integer hash
1866 * Using 7 instead of 8 just because that makes the integers lower than 32 bit (28 bit) and so they do not interfere with UNSIGNED integers or PHP-versions which has varying output from the hexdec function.
1867 *
1868 * @param string String to hash
1869 * @return integer Integer interpretation of the md5 hash of input string.
1870 */
function md5inthash($str) {
	// Take the first 7 hex digits (28 bit) of the md5 digest and read them as a number.
	$digest = md5($str);
	$leadingHex = substr($digest, 0, 7);

	return hexdec($leadingHex);
}
1874
1875 /**
1876 * Calculates the cHash value of input GET array (for constructing cHash values if needed)
1877 *
1878 * @param array Array of GET parameters to encode
1879 * @return string Short MD5 hash of the serialized cHash parameters
1880 */
function makeCHash($paramArray) {
	// Turn the parameter array into a query string first ...
	$queryString = t3lib_div::implodeArrayForUrl('', $paramArray);

	// ... pass it through t3lib_div::cHashParams() ...
	$cHashParameters = t3lib_div::cHashParams($queryString);

	// ... and return the short MD5 of the serialized result.
	$serialized = serialize($cHashParameters);
	return t3lib_div::shortMD5($serialized);
}
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900 /*********************************
1901 *
1902 * Internal logging functions
1903 *
1904 *********************************/
1905
1906 /**
1907 * Push function wrapper for TT logging
1908 *
1909 * @param string Title to set
1910 * @param string Key (?)
1911 * @return void
1912 */
function log_push($msg,$key) {
	// Forward to the TT object only when it exists. The isset() check avoids an
	// undefined-index notice/warning when $GLOBALS['TT'] has not been set at all.
	if (isset($GLOBALS['TT']) && is_object($GLOBALS['TT'])) {
		$GLOBALS['TT']->push($msg,$key);
	}
}
1916
1917 /**
1918 * Pull function wrapper for TT logging
1919 *
1920 * @return void
1921 */
function log_pull() {
	// Forward to the TT object only when it exists. The isset() check avoids an
	// undefined-index notice/warning when $GLOBALS['TT'] has not been set at all.
	if (isset($GLOBALS['TT']) && is_object($GLOBALS['TT'])) {
		$GLOBALS['TT']->pull();
	}
}
1925
1926 /**
1927 * Set log message function wrapper for TT logging
1928 *
1929 * @param string Message to set
1930 * @param integer Error number
1931 * @return void
1932 */
function log_setTSlogMessage($msg, $errorNum=0) {
	// Forward to the TT object only when it exists. The isset() check avoids an
	// undefined-index notice/warning when $GLOBALS['TT'] has not been set at all.
	if (isset($GLOBALS['TT']) && is_object($GLOBALS['TT'])) {
		$GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
	}

	// The message is always recorded in the internal log, whether or not TT is available.
	$this->internal_log[] = $msg;
}
1937
1938
1939
1940
1941
1942
1943
1944
1945 /**************************
1946 *
1947 * tslib_fe hooks:
1948 *
1949 **************************/
1950
1951 /**
1952 * Frontend hook: If the page is not being re-generated this is our chance to force it to be (because re-generation of the page is required in order to have the indexer called!)
1953 *
1954 * @param array Parameters from frontend
1955 * @param object TSFE object (reference under PHP5)
1956 * @return void
1957 */
function fe_headerNoCache(&$params, $ref) {
	// Three requirements, checked in this order: the crawler extension is loaded,
	// a crawler session is running, and re-indexing was requested as processing instruction.
	if (!t3lib_extMgm::isLoaded('crawler')) {
		return;
	}
	if (!$params['pObj']->applicationData['tx_crawler']['running']) {
		return;
	}
	if (!in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])) {
		return;
	}

	// Leave a simple entry in the crawler log, including the previous flag value.
	$params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];

	// Disable the look-up for cached page data, so the page is re-generated even if cached.
	$params['disableAcquireCacheData'] = TRUE;
}
1972 }
1973
1974
// XCLASS inclusion: when an extending class file is registered for this script
// under the current TYPO3_MODE, include it.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
	}
}
1978 ?>