Indexed Search modifications for support of cronjob based indexing. More to come...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 138: class tx_indexedsearch_indexer
39 * 204: function hook_indexContent(&$pObj)
40 *
41 * SECTION: Backend API
42 * 303: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
43 * 340: function backend_setFreeIndexUid($freeIndexUid)
44 * 357: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
45 *
46 * SECTION: Initialization
47 * 408: function init()
48 * 459: function initializeExternalParsers()
49 *
50 * SECTION: Indexing; TYPO3 pages (HTML content)
51 * 500: function indexTypo3PageContent()
52 * 586: function splitHTMLContent($content)
53 * 632: function getHTMLcharset($content)
54 * 647: function convertHTMLToUtf8($content,$charset='')
55 * 675: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
56 * 702: function typoSearchTags(&$body)
57 * 731: function extractLinks($content)
58 * 774: function extractHyperLinks($string)
59 *
60 * SECTION: Indexing; external URL
61 * 826: function indexExternalUrl($externalUrl)
62 * 857: function getUrlHeaders($url, $timeout = 2)
63 *
64 * SECTION: Indexing; external files (PDF, DOC, etc)
65 * 917: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
66 * 1023: function readFileContent($ext,$absFile,$cPKey)
67 * 1040: function fileContentParts($ext,$absFile)
68 * 1058: function splitRegularContent($content)
69 *
70 * SECTION: Analysing content, Extracting words
71 * 1091: function charsetEntity2utf8(&$contentArr, $charset)
72 * 1114: function procesWordsInArrays($contentArr)
73 * 1137: function bodyDescription($contentArr)
74 * 1159: function indexAnalyze($content)
75 * 1180: function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
76 * 1199: function analyzeBody(&$retArr,$content)
77 * 1219: function metaphone($word,$retRaw=FALSE)
78 *
79 * SECTION: SQL; TYPO3 Pages
80 * 1261: function submitPage()
81 * 1330: function submit_grlist($hash,$phash_x)
82 * 1350: function submit_section($hash,$hash_t3)
83 * 1368: function removeOldIndexedPages($phash)
84 *
85 * SECTION: SQL; External media
86 * 1411: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
87 * 1473: function submitFile_grlist($hash)
88 * 1487: function submitFile_section($hash)
89 * 1501: function removeOldIndexedFiles($phash)
90 *
91 * SECTION: SQL Helper functions
92 * 1537: function checkMtimeTstamp($mtime,$phash)
93 * 1573: function checkContentHash()
94 * 1590: function checkExternalDocContentHash($hashGr,$content_md5h)
95 * 1604: function is_grlist_set($phash_x)
96 * 1617: function update_grlist($phash,$phash_x)
97 * 1632: function updateTstamp($phash,$mtime=0)
98 * 1648: function updateParsetime($phash,$parsetime)
99 * 1661: function updateRootline()
100 * 1676: function getRootLineFields(&$fieldArr)
101 * 1695: function removeLoginpagesWithContentHash()
102 *
103 * SECTION: SQL; Submitting words
104 * 1730: function checkWordList($wl)
105 * 1767: function submitWords($wl,$phash)
106 * 1791: function freqMap($freq)
107 *
108 * SECTION: Hashing
109 * 1824: function setT3Hashes()
110 * 1850: function setExtHashes($file,$subinfo=array())
111 * 1874: function md5inthash($str)
112 * 1884: function makeCHash($paramArray)
113 *
114 * SECTION: Internal logging functions
115 * 1916: function log_push($msg,$key)
116 * 1925: function log_pull()
117 * 1936: function log_setTSlogMessage($msg, $errorNum=0)
118 *
119 * SECTION: tslib_fe hooks:
120 * 1961: function fe_headerNoCache(&$params, $ref)
121 *
122 * TOTAL FUNCTIONS: 56
123 * (This index is automatically created/updated by the extension "extdeveval")
124 *
125 */
126
127
128 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
129
130
131 /**
132 * Indexing class for TYPO3 frontend
133 *
134 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
135 * @package TYPO3
136 * @subpackage tx_indexedsearch
137 */
138 class tx_indexedsearch_indexer {
139
// Messages, keyed by the status code that indexTypo3PageContent() receives from checkMtimeTstamp():
var $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => "The configured max-age was exceeded for the document and thus it's indexed.",
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

// HTML code blocks to exclude from indexing (comma list of tag names):
var $excludeSections = 'script,style';

// Supported Extensions for external files:
var $external_parsers = array();		// External parser objects, keys are file extension names. Values are objects with certain methods.

// Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
var $defaultGrList = '0,-1';

// Min/Max times:
var $tstamp_maxAge = 0;		// If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
var $tstamp_minAge = 0;		// If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
var $maxExternalFiles = 0;	// Max number of external files to index.
var $flagBitMask;			// Assigned in init() from the "flagBitMask" extension setting (limited to 0-255 there).

var $forceIndexing = FALSE;	// If true, indexing is forced despite of hashes etc.
var $crawlerActive = FALSE;	// Set when crawler is detected (internal)

// INTERNALS:
var $defaultContentArray=array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => '',
);
var $wordcount = 0;
var $externalFileCounter = 0;

var $conf = array();				// Configuration set internally (see init functions for required keys and their meaning)
var $indexerConfig = array();		// Indexer configuration
var $hash = array();				// Hash array, contains phash and phash_grouping
var $file_phash_arr = array();		// Hash array for files
var $contentParts = array();		// Content of TYPO3 page
var $content_md5h = '';
var $internal_log = array();		// Internal log
var $indexExternalUrl_content = '';

var $cHashParams = array();			// cHashparams array

var $freqRange = 32000;
var $freqMax = 0.1;

// Objects:
var $csObj;			// Charset class object , t3lib_cs
var $metaphoneObj;	// Metaphone object, if any
var $lexerObj;		// Lexer object for word splitting
var $htmlParser;	// t3lib_parseHtml instance, created on first use by extractHyperLinks()
195
196
197
/**
 * Frontend hook: decides whether the page currently rendered by TSFE is to be indexed and,
 * if so, copies the page data into $this->conf and runs the indexer.
 *
 * Nothing happens unless config.index_enable is set for the page. When it is set, indexing
 * is still skipped (with a TS log message) if ordinary frontend indexing is disabled and no
 * crawler run forces it, if the page has the "no_search" flag, or if the page is "no_cache".
 *
 * @param	object		Parent Object (frontend TSFE object), passed by reference
 * @return	void
 */
function hook_indexContent(&$pObj) {

		// Indexer configuration from Extension Manager interface:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Crawler activation:
		// Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
		// The is_array() test keeps in_array() from being called on a missing/non-array instruction list.
	if (t3lib_extMgm::isLoaded('crawler')
			&& $pObj->applicationData['tx_crawler']['running']
			&& is_array($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
			&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {

			// Setting simple log message:
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

			// Setting variables:
		$this->crawlerActive = TRUE;	// Crawler active flag
		$this->forceIndexing = TRUE;	// Force indexing despite timestamps etc.
	}

		// Indexing has to be switched on for the page at all:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

	if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page header!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} else {

			// Setting up internal configuration from config array:
		$this->conf = array();

			// Information about page for which the indexing takes place
		$this->conf['id'] = $pObj->id;				// Page id
		$this->conf['type'] = $pObj->type;			// Page type
		$this->conf['sys_language_uid'] = $pObj->sys_language_uid;	// sys_language UID of the language of the indexing.
		$this->conf['MP'] = $pObj->MP;				// MP variable, if any (Mount Points)
		$this->conf['gr_list'] = $pObj->gr_list;	// Group list

		$this->conf['cHash'] = $pObj->cHash;					// cHash string for additional parameters
		$this->conf['cHash_array'] = $pObj->cHash_array;		// Array of the additional parameters

		$this->conf['crdate'] = $pObj->page['crdate'];			// The creation date of the TYPO3 page
		$this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;	// reg1 of the caching table. Not known what practical use this has.

			// Root line uids
		$this->conf['rootline_uids'] = array();
		foreach($pObj->config['rootLine'] as $rlkey => $rldat) {
			$this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
		}

			// Content of page:
		$this->conf['content'] = $pObj->content;					// Content string (HTML of TYPO3 page)
		$this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);	// Alternative title for indexing
		$this->conf['metaCharset'] = $pObj->metaCharset;			// Character set of content (will be converted to utf-8 during indexing)
		$this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];	// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

			// Configuration of behavior:
		$this->conf['index_externals'] = $pObj->config['config']['index_externals'];	// Whether to index external documents like PDF, DOC etc. (if possible)
		$this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];		// Length of description text (max 250, default 200)

			// Set to zero:
		$this->conf['recordUid'] = 0;
		$this->conf['freeIndexUid'] = 0;
		$this->conf['freeIndexSetId'] = 0;

			// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
278
279
280
281
282
283
284
285
286 /****************************
287 *
288 * Backend API
289 *
290 ****************************/
291
/**
 * Backend entry point: fills $this->conf with the identity of the page being indexed
 * (or to which external media is attached) and then calls init().
 *
 * @param	integer		The page uid, &id=
 * @param	integer		The page type, &type=
 * @param	integer		sys_language uid, typically &L=
 * @param	string		The MP variable (Mount Points), &MP=
 * @param	array		Rootline array of only UIDs.
 * @param	array		Array of GET variables to register with this indexing
 * @param	boolean		If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
 * @return	void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

		// Fresh configuration, starting with the information about the page for which the indexing takes place:
	$this->conf = array(
		'id' => $id,								// Page id (integer)
		'type' => $type,							// Page type (integer)
		'sys_language_uid' => $sys_language_uid,	// sys_language UID of the language of the indexing (integer)
		'MP' => $MP,								// MP variable, if any (Mount Points) (string)
		'gr_list' => '0,-1',						// Group list (hardcoded for now...)
	);

		// cHash string for the additional parameters; only calculated on request:
	if ($createCHash) {
		$this->conf['cHash'] = $this->makeCHash($cHash_array);
	} else {
		$this->conf['cHash'] = '';
	}
	$this->conf['cHash_array'] = $cHash_array;	// Array of the additional parameters

		// Defaults:
	$this->conf['freeIndexUid'] = 0;
	$this->conf['freeIndexSetId'] = 0;
	$this->conf['page_cache_reg1'] = '';

		// Root line uids as handed in by the caller:
	$this->conf['rootline_uids'] = $uidRL;

		// Configuration of behavior:
	$this->conf['index_externals'] = 1;		// Whether to index external documents like PDF, DOC etc. (if possible)
	$this->conf['index_descrLgd'] = 200;	// Length of description text (max 250, default 200)

		// Initialize the indexer with this configuration:
	$this->init();
}
335
/**
 * Stores the free-index uid and the free-index set id in $this->conf.
 * Can be called right after backend_initIndexer()
 *
 * @param	integer		Free index UID
 * @param	integer		Free index set id (defaults to 0)
 * @return	void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
		// Both values are only recorded here; nothing else is triggered by this call.
	$this->conf['freeIndexUid'] = $freeIndexUid;
	$this->conf['freeIndexSetId'] = $freeIndexSetId;
}
346
/**
 * Indexes arbitrary record content as if it was the content of a TYPO3 page:
 * the values are wrapped in a minimal HTML document which is then passed through
 * indexTypo3PageContent().
 *
 * @param	string		Title equivalent
 * @param	string		Keywords equivalent
 * @param	string		Description equivalent
 * @param	string		The main content to index
 * @param	string		The charset of the title, keyword, description and body-content
 * @param	integer		Last modification time, in seconds
 * @param	integer		The creation date of the content, in seconds
 * @param	integer		The record UID that the content comes from (for registration with the indexed rows)
 * @return	void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

		// Timestamps and origin of the content:
	$this->conf['mtime'] = $mtime;			// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;		// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

		// All four values are escaped before they are placed in the fake HTML document:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedContent = htmlspecialchars($content);

		// Construct fake HTML for parsing:
	$this->conf['content'] = '
<html>
<head>
<title>'.$escapedTitle.'</title>
<meta name="keywords" content="'.$escapedKeywords.'" />
<meta name="description" content="'.$escapedDescription.'" />
</head>
<body>
'.$escapedContent.'
</body>
</html>';	// Content string (HTML of TYPO3 page)

		// Initializing charset:
	$this->conf['metaCharset'] = $charset;	// Character set of content (will be converted to utf-8 during indexing)
	$this->conf['indexedDocTitle'] = '';	// Alternative title for indexing

		// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}
387
388
389
390
391
392
393
394
395
396
397
398
399
400 /********************************
401 *
402 * Initialization
403 *
404 *******************************/
405
/**
 * Prepares the indexer for a run: cHash parameters, page hashes, settings from the
 * Extension Manager and the helper objects (external parsers, lexer, metaphone, charset).
 * $this->conf MUST be set with proper values prior to this call!!!
 *
 * @return	void
 */
function init() {
	global $TYPO3_CONF_VARS;

		// Additional GET parameters of the page:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		if ($this->conf['cHash']) {
				// Add this so that URL's come out right...
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
			// encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
		unset($this->cHashParams['encryptionKey']);
	}

		// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
	$this->setT3Hashes();

		// Indexer configuration from Extension Manager interface (minAge/maxAge are multiplied by 3600 before being stored):
	$this->indexerConfig = unserialize($TYPO3_CONF_VARS['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

		// Initialize external document parsers:
		// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Initialize lexer (class that deconstructs the text into words):
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer']) {
		$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'];
	} else {
			// Nothing configured, fall back to the lexer shipped with the extension:
		$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	}
	$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

		// Initialize metaphone hook:
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = &$this;
	}

		// Init charset class:
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
455
/**
 * Instantiates the external parsers registered in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (file extension => object reference)
 * and keeps those whose initParser() accepts the extension.
 *
 * A reference that does not resolve to an object is skipped instead of being dereferenced.
 *
 * @return	void
 * @access private
 * @see init()
 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
			$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

				// No usable parser object for this extension (e.g. misconfigured reference): drop the entry and go on.
			if (!is_object($this->external_parsers[$extension])) {
				unset($this->external_parsers[$extension]);
				continue;
			}

				// Give the parser access to this indexer:
			$this->external_parsers[$extension]->pObj = &$this;

				// Init parser and if it returns false, unset its entry again:
			if (!$this->external_parsers[$extension]->initParser($extension)) {
				unset($this->external_parsers[$extension]);
			}
		}
	}
}
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493 /********************************
494 *
495 * Indexing; TYPO3 pages (HTML content)
496 *
497 *******************************/
498
/**
 * Start indexing of the TYPO3 page found in $this->conf.
 *
 * The page is processed when checkMtimeTstamp() returns a positive code, when no gr_list
 * entry exists for the phash, or when indexing is forced. If another indexed page with the
 * same content hash is found (and the max-age code 1 was not returned), only timestamp,
 * set id, gr_list and rootline are updated; otherwise the content is fully re-indexed.
 *
 * @return	void
 */
function indexTypo3PageContent() {

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

		// Nothing to do at all?
	if (!($check > 0 || !$is_grlist || $this->forceIndexing)) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

		// Setting message:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

		// Divide into title,keywords,description and body:
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

		// Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
		// Note: separator first - the reversed (array, separator) argument order is not accepted by PHP 8.
	$this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

		// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
		// If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
		// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
	$checkCHash = $this->checkContentHash();
	if (!is_array($checkCHash) || $check===1) {
		$parseStart = t3lib_div::milliseconds();

		$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
		$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
		$this->log_pull();

			// Splitting words
		$this->log_push('Extract words from content','');
		$splitInWords = $this->procesWordsInArrays($this->contentParts);
		$this->log_pull();

			// Analyse the indexed words.
		$this->log_push('Analyse the extracted words','');
		$indexArr = $this->indexAnalyze($splitInWords);
		$this->log_pull();

			// Submitting page (phash) record
		$this->log_push('Submitting page','');
		$this->submitPage();
		$this->log_pull();

			// Check words and submit to word list if not there
		$this->log_push('Check word list and submit words','');
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr,$this->hash['phash']);
		$this->log_pull();

			// Set parsetime
		$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$parseStart);

			// Checking external files if configured for.
		$this->log_push('Checking external files','');
		if ($this->conf['index_externals']) {
			$this->extractLinks($this->conf['content']);
		}
		$this->log_pull();
	} else {
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);		// Update the timestamp
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
	}
}
583
/**
 * Splits a HTML document into the parts that are indexed: title, keywords and description
 * (from the meta tags of the head) and the tag-stripped body text.
 *
 * @param	string		HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return	array		Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
function splitHTMLContent($content) {
	$contentArr = $this->defaultContentArray;

		// Everything from the first "<body" on is the body, whatever precedes it is the head.
		// NOTE(review): without a "<body" tag stristr() gives FALSE and the head part ends up empty as well - confirm that this is intended.
	$contentArr['body'] = stristr($content,'<body');
	$headPart = substr($content,0,-strlen($contentArr['body']));

		// Title: if it contains a colon, only the part after the first colon is used
	$this->embracingTags($headPart,'TITLE',$contentArr['title'],$unusedAfter,$unusedParams);
	$titleParts = explode(':',$contentArr['title'],2);
	if (isset($titleParts[1])) {
		$contentArr['title'] = trim($titleParts[1]);
	} else {
		$contentArr['title'] = trim($titleParts[0]);
	}

		// Collect the attribute strings of all meta tags, consuming the head part tag by tag
	$metaTags = array();
	$metaCount = 0;
	while ($this->embracingTags($headPart,'meta',$unusedContent,$headPart,$metaTags[$metaCount])) {
		$metaCount++;
	}

		// Append the content of keywords / description meta tags (each value is prefixed with a comma)
	$metaIndex = 0;
	while (isset($metaTags[$metaIndex])) {
		$attributes = t3lib_div::get_tag_attributes($metaTags[$metaIndex]);
		if (stristr($attributes['name'],'keywords')) {
			$contentArr['keywords'].=','.$attributes['content'];
		}
		if (stristr($attributes['name'],'description')) {
			$contentArr['description'].=','.$attributes['content'];
		}
		$metaIndex++;
	}

		// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

		// Get rid of unwanted sections (ie. scripting and style stuff) in body, one occurrence at a time
	foreach (explode(',',$this->excludeSections) as $excludedTag) {
		do {
			$sectionRemoved = $this->embracingTags($contentArr['body'],$excludedTag,$unusedContent,$contentArr['body'],$unusedParams);
		} while ($sectionRemoved);
	}

		// Remove tags; a space is put in front of every tag first so that words are not concatenated
	$spacedBody = str_replace('<',' <',$contentArr['body']);
	$contentArr['body'] = trim(strip_tags($spacedBody));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

	return $contentArr;
}
630
/**
 * Extract the charset value from the "Content-Type" http-equiv meta tag of a HTML document.
 *
 * Matching is case-insensitive. Uses preg_match(), since the POSIX eregi() function is
 * not available anymore in current PHP versions.
 *
 * @param	string		HTML content
 * @return	string		The charset value if found, otherwise NULL.
 */
function getHTMLcharset($content) {
		// First locate the whole <meta http-equiv="Content-Type" ...> tag...
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
			// ...then pick the charset=... value out of that tag only.
		if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) {
			return $reg2[1];
		}
	}

	return NULL;
}
644
/**
 * Converts a HTML document to utf-8, including its entities.
 *
 * @param	string		HTML content, any charset
 * @param	string		Optional charset (otherwise extracted from HTML)
 * @return	string		Converted HTML
 */
function convertHTMLToUtf8($content,$charset='') {

		// Without an explicit charset the one declared in the document itself is used:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

		// Convert charset, unless there is none or the document is utf-8 already:
	if ($charset && $charset!=='utf-8') {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

		// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
667
/**
 * Looks up the first occurrence of <tagName ...>...</tagName> (case-insensitive) and hands back,
 * through the reference parameters, the embraced content, the input string with that element cut out
 * and the attribute string of the opening tag. ie. useful for finding <title> of document or
 * removing <script>-sections.
 * If the opening tag has no matching end tag, the content is blank and only what follows the opening
 * tag is handed back as "string after".
 *
 * @param	string		String to search in
 * @param	string		Tag name, eg. "script"
 * @param	string		Passed by reference: Content inside found tag
 * @param	string		Passed by reference: Content after found tag
 * @param	string		Passed by reference: Attributes of the found tag.
 * @return	boolean		Returns false if tag was not found, otherwise true.
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$openingTag = '<'.$tagName;
	$closingTag = '</'.$tagName.'>';

		// Case-insensitive search; gives the input from the opening tag onwards
	$fromOpeningTag = stristr($string,$openingTag);
	if (!$fromOpeningTag) {
		return false;
	}

		// Split what follows the tag name at the first ">" into attribute string and remainder
	$pieces = explode('>',substr($fromOpeningTag,strlen($openingTag)),2);
	$paramList = $pieces[0];
	$remainder = $pieces[1];

	$fromClosingTag = stristr($remainder,$closingTag);
	if (!$fromClosingTag) {
			// No end tag: blank content, and only what follows the opening tag is passed back
		$tagContent = '';
		$stringAfter = $remainder;
		return true;
	}

		// End tag found: content is what lies in between, "after" is the input without the whole element
	$openingTagPos = strpos(strtolower($string), strtolower($openingTag));
	$tagContent = substr($remainder,0,strlen($remainder)-strlen($fromClosingTag));
	$stringAfter = substr($string,0,$openingTagPos).substr($fromClosingTag,strlen($closingTag));

	return true;
}
700
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin" marker is kept,
 * content following an "end" marker is dropped. Content in front of an "end" marker that was not
 * preceded by a "begin" marker (eg. the text before the very first marker) is kept as well.
 * Without any marker the body is left untouched.
 *
 * @param	string		HTML Content, passed by reference
 * @return	boolean		Returns true if a TYPOSEARCH_ tag was found, otherwise false.
 */
function typoSearchTags(&$body) {
	$expBody = explode('<!--TYPO3SEARCH_',$body);

	if (count($expBody) <= 1) {
		return false;
	}

	$body = '';
	$prev = '';		// Pending content that is only included if an "end" marker follows it; initialized so it is never read unset.

	foreach($expBody as $val) {
		$part = explode('-->',$val,2);
		$marker = trim($part[0]);

		if ($marker === 'begin') {
				// A marker without closing "-->" has no content part.
			$body.= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker === 'end') {
			$body.= $prev;
		} else {
			$prev = $val;
		}
	}

	return true;
}
729
/**
 * Walks through all hyperlinks (hrefs) of the HTML content and hands indexable targets on:
 * links with a URL scheme go to indexExternalUrl() (if enabled in the indexer configuration),
 * links without scheme and query string that point to an existing local file go to indexRegularDocument().
 *
 * @param	string		HTML content
 * @return	void
 */
function extractLinks($content) {

	foreach($this->extractHyperLinks($content) as $linkInfo) {

			// Decode entities and parse the URL:
		$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
		$qParts = parse_url($linkSource);

			// Check for jumpurl (TYPO3 specific thing...): the real target is carried in the query string
		if ($qParts['query'] && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);
			$linkSource = $getP['jumpurl'];
			$qParts = parse_url($linkSource);	// parse again due to new linkSource!
		}

			// Anything with a scheme is treated as external URL (http or otherwise):
		if ($qParts['scheme']) {
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

			// Links carrying a query string are not regarded as files:
		if ($qParts['query']) {
			continue;
		}

			// Index local file, if it exists:
		$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		if ($localFile && @is_file($localFile)) {
			$this->indexRegularDocument($linkSource);
		}
	}
}
771
/**
 * Collects all <a> tags with a non-empty href attribute from the content string.
 *
 * @param	string		Content to analyse
 * @return	array		Array of hyperlinks; each entry has the keys "tag" (the tag as found) and "href"
 * @see extractLinks()
 */
function extractHyperLinks($string) {

		// The HTML parser is created on first use and then reused:
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$list = array();
	foreach($this->htmlParser->splitTags('a',$string) as $index => $fragment) {

			// Only odd-numbered parts are looked at (presumably those are the tags in the splitTags() result - TODO confirm)
		if (!($index%2)) {
			continue;
		}

		$params = $this->htmlParser->get_tag_attributes($fragment,1);
		$firstTagName = $this->htmlParser->getFirstTagName($fragment);	// The 'name' of the first tag

		if (strtolower($firstTagName) == 'a' && $params[0]['href']) {
			$list[] = array(
				'tag' => $fragment,
				'href' => $params[0]['href']
			);
		}
	}

	return $list;
}
807
808
809
810
811
812
813
814
815
816
817
818 /******************************************
819 *
820 * Indexing; external URL
821 *
822 ******************************************/
823
/**
 * Index External URLs HTML content
 * The headers of the URL are requested first; only if the response is announced as "text/html"
 * the document is downloaded to a temporary file and indexed through indexRegularDocument().
 * The downloaded content is also kept in $this->indexExternalUrl_content.
 *
 * @param	string		URL, eg. "http://typo3.org/"
 * @return	void
 * @see indexRegularDocument()
 */
function indexExternalUrl($externalUrl) {

		// Get headers (FALSE means the URL could not be requested at all):
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
		return;
	}

		// HTTP header names are case-insensitive, so "Content-Type" is looked up regardless of case:
	$contentType = '';
	foreach($urlHeaders as $headKey => $headValue) {
		if (!strcasecmp(trim($headKey),'Content-Type')) {
			$contentType = $headValue;
			break;
		}
	}

	if (stristr($contentType,'text/html')) {
		$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
		if (strlen($content)) {

				// Create temporary file. The returned name is used unchanged: appending a suffix (like ".html") would address
				// a second file and leave the one created by tempnam() behind (NOTE: assumes t3lib_div::tempnam() wraps PHP's tempnam()).
				// The extension is passed to indexRegularDocument() separately.
			$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
			if ($tmpFile) {
				t3lib_div::writeFile($tmpFile, $content);

					// Index that file:
				$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');	// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
				unlink($tmpFile);
			}
		}
	}
}
854
/**
 * Getting HTTP response headers of URL.
 * A plain HTTP/1.0 GET request is sent and the response is read up to the first empty line.
 *
 * @param	string		The URL
 * @param	integer		Connection timeout in seconds (passed to fsockopen())
 * @return	mixed		FALSE if the URL cannot be parsed, has a scheme other than "http"/none, has no host or if the connection fails. Otherwise an array where HTTP header names are keys and the values are whatever follows the first colon of the header line (leading whitespace preserved). The status line ends up as a key with a NULL value.
 */
function getUrlHeaders($url, $timeout = 2) {
	$url = parse_url($url);
	if (!is_array($url)) {
		return FALSE;
	}

		// Only plain HTTP (or scheme-less) URLs are supported:
	$scheme = isset($url['scheme']) ? strtolower($url['scheme']) : '';
	if (!in_array($scheme,array('','http'))) {
		return FALSE;
	}

		// Without a host there is nothing to connect to:
	if (empty($url['host'])) {
		return FALSE;
	}

	$port = (isset($url['port']) && $url['port'] > 0) ? $url['port'] : 80;

	$fp = fsockopen($url['host'], $port, $errno, $errstr, $timeout);
	if (!$fp) {
		return FALSE;
	}

		// The request target must never be empty; "http://example.org" has no path component, so "/" is requested:
	$path = (isset($url['path']) && strlen($url['path'])) ? $url['path'] : '/';
	$query = (isset($url['query']) && strlen($url['query'])) ? '?'.$url['query'] : '';

		// A non-default port belongs into the Host header:
	$hostHeader = $url['host'].($port != 80 ? ':'.$port : '');

	$msg = "GET ".$path.$query." HTTP/1.0\r\nHost: ".$hostHeader."\r\n\r\n";
	fputs($fp, $msg);

		// Read until the empty line that ends the header section (or until reading fails):
	$d = '';
	while (!feof($fp)) {
		$line = fgets($fp,2048);

		$d.=$line;
		if (!strlen(trim($line))) {
			break;
		}
	}
	fclose($fp);

		// Compile headers:
	$headers = t3lib_div::trimExplode(chr(10),$d,1);
	$retVal = array();
	foreach($headers as $line) {
		$lineParts = explode(':', $line, 2);
		$retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
	}
	return $retVal;
}
894
895
896
897
898
899
900
901
902
903
904
905
906
907 /******************************************
908 *
909 * Indexing; external files (PDF, DOC, etc)
910 *
911 ******************************************/
912
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 * The document is split into parts by the external parser registered for its extension;
 * every part is indexed separately (own phash) if mtime/content hash checks or $force demand it.
 *
 * @param	string		Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param	boolean		If set, indexing is forced (despite content hashes, mtime etc).
 * @param	string		Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param	string		File extension for temporary file.
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

		// Init
	$fI = pathinfo($file);
	$ext = $altExtension ? $altExtension : strtolower(isset($fI['extension']) ? $fI['extension'] : '');

		// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

		// Nothing to do without a readable file...
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}

		// ... or without a parser for the extension:
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

		// Indexing the document, part by part:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

					// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
						// Note the argument order: separator first. The reversed legacy order is not accepted by PHP 8 anymore.
					$content_md5h = $this->md5inthash(implode('',$contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->procesWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
			} else $this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
		} else $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);

			// Checking and setting sections:
#		$this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
		$this->submitFile_section($phash_arr['phash']);	// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->log_pull();
	}
}
1018
/**
 * Reads the content of an external file being indexed.
 * The work is delegated to the external parser object registered for the extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return	mixed		Whatever the parser returns, normally the standard content array (title, description, keywords, body keys). NULL if no parser object is registered for the extension.
 */
function readFileContent($ext,$absFile,$cPKey) {

		// Explicit default, so the return value is defined when there is no parser:
	$contentArr = NULL;

		// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
1037
/**
 * Asks the external parser of the extension into which sections the document is divided.
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Array of pointers to sections that the document should be divided into. Without a parser object for the extension this is array(0), ie. a single, undivided section.
 */
function fileContentParts($ext,$absFile) {

		// No parser object: the document is treated as one part
	if (!is_object($this->external_parsers[$ext])) {
		return array(0);
	}

		// Otherwise the parser decides:
	return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
}
1055
/**
 * Wraps non-HTML content (from external files for instance) into a content array.
 *
 * @param	string		Input content (non-HTML) to index.
 * @return	array		Copy of $this->defaultContentArray with the key "body" set to the input content
 * @see splitHTMLContent()
 */
function splitRegularContent($content) {

		// Start out from the default array and put everything into the body:
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;

	return $parts;
}
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083 /**********************************
1084 *
1085 * Analysing content, Extracting words
1086 *
1087 **********************************/
1088
/**
 * Convert character set and HTML entities in the value of input content array keys
 * Empty values are left untouched.
 *
 * @param	array		Standard content array, passed by reference and modified in place
 * @param	string		Charset of the input content (converted to utf-8)
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset) {
	if (!is_array($contentArr)) {
		return;
	}

		// Iterating over the keys (instead of each(), which is not available in PHP 8) since the values are rewritten in place:
	foreach(array_keys($contentArr) as $key) {
		if (strlen($contentArr[$key])) {

				// Convert charset if necessary
			if ($charset!=='utf-8') {
				$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
			}

				// decode all numeric / html-entities in the string to real characters:
			$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
		}
	}
}
1112
/**
 * Processing words in the array from split*Content -functions
 *
 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return	array		Content input array modified so each key is now an array of words (without duplicates for "title", "keywords" and "description")
 */
function procesWordsInArrays($contentArr) {

		// split all parts to words (foreach instead of each(), which is not available in PHP 8)
	foreach(array_keys($contentArr) as $key) {
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

		// For title, keywords, and description we don't want duplicates:
	foreach(array('title','keywords','description') as $key) {
		if (isset($contentArr[$key]) && is_array($contentArr[$key])) {
			$contentArr[$key] = array_unique($contentArr[$key]);
		}
	}

		// Return modified array:
	return $contentArr;
}
1135
/**
 * Extracts the sample description text from the content array.
 * Tabs and line breaks of the body are turned into spaces and the result is truncated to the
 * length configured in $this->conf['index_descrLgd'].
 *
 * @param	array		Content array
 * @return	string		Description string. Empty if the resulting maximum length is zero.
 */
function bodyDescription($contentArr) {

		// Explicit default, so the return value is defined when no description is wanted:
	$bodyDescription = '';

		// Setting description (NOTE: intInRange() presumably limits the value to 0-255, using 200 as default - confirm in t3lib_div)
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

			// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1157
/**
 * Analyzes content to use for indexing,
 * The header parts are registered with a priority bit each (title 7, keywords 6, description 5), then the body words are added.
 *
 * @param	array		Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return	array		Index Array: one entry per word (the word is the key) with the information collected by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content) {
	$indexArr = array();

	$this->analyzeHeaderinfo($indexArr,$content,'title',7);
	$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
	$this->analyzeHeaderinfo($indexArr,$content,'description',5);
	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}
1175
/**
 * Calculates relevant information for headercontent
 * For every word of the given header part the priority bit is set in "cmp", "count" is increased
 * and "hash" / "metaphone" are (re)calculated. $this->wordcount is increased per word.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @param	string		Key from standard content array
 * @param	integer		Bit-wise priority to type (bit position set in "cmp")
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {

		// Nothing to analyse if the part is missing:
	if (!isset($content[$key]) || !is_array($content[$key])) {
		return;
	}

		// foreach instead of each(), which is not available in PHP 8:
	foreach($content[$key] as $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

			// Start from zero for words not seen before instead of reading undefined indexes:
		$cmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

		$retArr[$val]['cmp'] = $cmp|pow(2,$offset);
		$retArr[$val]['count'] = $count+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
1196
/**
 * Calculates relevant information for bodycontent
 * Words not yet in the index array get "first" (key of the first occurrence in the body list),
 * "hash" and "metaphone" set; "count" is increased for every occurrence. $this->wordcount is increased per word.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @return	void
 */
function analyzeBody(&$retArr,$content) {

		// Nothing to analyse without a body word list:
	if (!isset($content['body']) || !is_array($content['body'])) {
		return;
	}

	foreach($content['body'] as $key => $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.
		if(!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

			// Words registered here for the first time have no counter yet:
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $count+1;
		$this->wordcount++;
	}
}
1216
/**
 * Calculates the metaphone value of a word, either raw or as integer hash.
 * If a metaphone object is available ($this->metaphoneObj) it is used, otherwise PHP's own metaphone() function.
 *
 * @param	string		Word to convert
 * @param	boolean		If set, returns the raw metaphone value (not hashed)
 * @return	mixed		Metaphone hash integer (zero for an empty metaphone value) or the raw value (string)
 */
function metaphone($word,$retRaw=FALSE) {

		// Custom metaphone object takes precedence over the PHP function:
	$raw = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

		// Raw value requested?
	if ($retRaw) {
		return $raw;
	}

		// Otherwise hash it: integer value of the first 7 hex digits of the md5 sum
	return $raw=='' ? 0 : hexdec(substr(md5($raw),0,7));
}
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255 /********************************
1256 *
1257 * SQL; TYPO3 Pages
1258 *
1259 *******************************/
1260
/**
 * Writes the index records of the current TYPO3 page (not external media) to the database:
 * index_phash, index_section, index_grlist, index_fulltext and - in debug mode - index_debug.
 * Existing records for the phash are removed first.
 *
 * @return	void
 */
function submitPage() {
	$phash = $this->hash['phash'];

		// Flush what is currently stored for this phash:
	$this->removeOldIndexedPages($phash);

		// index_phash: the main record of the page
	$phashRow = array(
		'phash' => $phash,
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

		// index_section: for a TYPO3 page both hash arguments are the page's own phash
	$this->submit_section($phash,$phash);

		// index_grlist: same here
	$this->submit_grlist($phash,$phash);

		// index_fulltext: all content parts joined by spaces
	$fulltextRow = array(
		'phash' => $phash,
		'fulltextdata' => implode(' ', $this->contentParts)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

		// index_debug: only in debug mode. "content" and "body" are cut down to 1000 bytes.
	if ($this->indexerConfig['debugMode']) {
		$debugInfo = array(
			'cHashParams' => $this->cHashParams,
			'external_parsers initialized' => array_keys($this->external_parsers),
			'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
			'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
			'logs' => $this->internal_log,
			'lexer' => $this->lexerObj->debugString,
		);
		$debugRow = array(
			'phash' => $phash,
			'debuginfo' => serialize($debugInfo)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1328
/**
 * Inserts an index_grlist record holding the current gr_list ($this->conf['gr_list']) and its integer hash.
 *
 * @param	integer		Search result record phash
 * @param	integer		Actual phash of current content
 * @return	void
 * @see update_grlist()
 */
function submit_grlist($hash,$phash_x) {
	$grList = $this->conf['gr_list'];

		// The gr_list record:
	$grlistRow = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($grList),
		'gr_list' => $grList
	);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grlistRow);
}
1348
/**
 * Inserts an index_section record for the current page id, including the rootline fields.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param	integer		phash of the indexed item itself (the file's phash when called from submitFile_section())
 * @param	integer		phash of the TYPO3 page the item belongs to (stored as "phash_t3")
 * @return	void
 */
function submit_section($hash,$hash_t3) {
	$sectionRow = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

		// Add rl0, rl1, rl2 (and possibly more) to the record:
	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1368
/**
 * Deletes everything stored in the index tables for the page with the given phash.
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedPages($phash) {
	$phashInt = intval($phash);

		// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash','index_section','index_grlist','index_fulltext','index_debug');
	foreach($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.$phashInt);
	}

		// Additionally all index_section records pointing to this page through phash_t3 are removed (this includes such records set for external media on the page!). They are re-inserted by indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phashInt);
}
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397 /********************************
1398 *
1399 * SQL; External media
1400 *
1401 *******************************/
1402
1403
/**
 * Writes the index records of an external file (or one section of it) to the database:
 * index_phash, index_fulltext and - in debug mode - index_debug.
 * Existing file records for the phash are removed first.
 *
 * @param	array		Array with phash and phash_grouping keys for file
 * @param	string		File name
 * @param	array		Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param	string		File extension determining the type of media.
 * @param	integer		Modification time of file.
 * @param	integer		Creation time of file.
 * @param	integer		Size of file in bytes
 * @param	integer		Content HASH value.
 * @param	array		Standard content array (using only title and body for a file)
 * @return	void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

		// Item type: taken from the parser's extension map, falling back to the extension itself
	$itemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	if (!$itemType) {
		$itemType = $ext;
	}

		// Flush what is currently stored for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

		// A scheme in the "filename" marks it as an external URL:
	$urlParts = parse_url($file);

		// index_phash: the main record. The title falls back to the basename of the file.
	$phashRow = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $itemType,
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => time(),
		'crdate' => time(),
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $urlParts['scheme'] ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

		// index_fulltext: all content parts joined by spaces
	$fulltextRow = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

		// index_debug: only in debug mode. "body" is cut down to 1000 bytes.
	if ($this->indexerConfig['debugMode']) {
		$debugInfo = array(
			'cHashParams' => $subinfo,
			'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
			'logs' => $this->internal_log,
			'lexer' => $this->lexerObj->debugString,
		);
		$debugRow = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize($debugInfo)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1474
/**
 * Makes sure a gr_list record exists for a file: one is inserted unless there already is a record
 * for the default gr_list or for the current gr_list.
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_grlist($hash) {

		// Look for a record of the non-logged in user (default gr_list) or of the current gr_list:
	$where = 'phash='.intval($hash).
			' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).
			' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')';
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

		// None found, so place one:
	if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		$this->submit_grlist($hash,$hash);
	}
}
1488
/**
 * Makes sure a section record exists which relates a file to the current page:
 * one is inserted unless there already is a record for the file's phash and the current page id.
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_section($hash) {

		// Is there a section record for the file on this page?
	$where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

		// If not, add it with the phash of the current page as "phash_t3":
	if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		$this->submit_section($hash,$this->hash['phash']);
	}
}
1502
/**
 * Deletes the records stored for the indexed file with the given phash.
 * index_section is not touched here.
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedFiles($phash) {
	$where = 'phash='.intval($phash);

		// Removing old registrations for tables.
	$tables = array('index_phash','index_grlist','index_fulltext','index_debug');
	foreach($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
	}
}
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531 /********************************
1532 *
1533 * SQL Helper functions
1534 *
1535 *******************************/
1536
/**
 * Decides from the mtime / tstamp of the already indexed page/file (found by phash) whether indexing is needed.
 * A positive return value means that the page needs to be indexed!
 *
 * @param	integer		mtime value to test against limits and indexed page.
 * @param	integer		"phash" used to select any already indexed page to see what its mtime is.
 * @return	integer		Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) Mtimes matched so no need to reindex page. 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index.
 */
function checkMtimeTstamp($mtime,$phash) {

		// Select indexed page:
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

		// Page has never been indexed (is not represented in the index_phash table).
	if (!$row) {
		return 4;
	}

		// The configured max-age was exceeded for the document and thus it's indexed.
	if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time()) {
		return 1;
	}

		// A minimum age is set and was not exceeded yet:
	if ($this->tstamp_minAge && !(($row['tstamp']+$this->tstamp_minAge) < time())) {
		return -2;
	}

		// From here on the mtime decides. Without an mtime the page must clearly be indexed.
	if (!$mtime) {
		return 3;
	}

		// mtime is different from the index_phash mtime, so it's about time to re-index.
	if ($row['item_mtime'] != $mtime) {
		return 2;
	}

		// mtime matched the document, so no changes detected and no content updated. The timestamp is only refreshed if no max age is configured.
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('Mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('Mtime matched, timestamp updated.',1);
	}
	return -1;
}
1575
/**
 * Looks up an index_phash record with the current phash_grouping and the current content hash.
 *
 * @return	mixed		Returns true (1) if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
 */
function checkContentHash() {

		// With this query the page will only be indexed if its content is different from the same "phash_grouping" -page.
	$where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
	return $row ? $row : 1;
}
1589
/**
 * Content hash check for external documents:
 * tests whether an index_phash record with the given phash_grouping and content hash exists.
 *
 * @param	integer		phash value to check (phash_grouping)
 * @param	integer		Content hash to check
 * @return	integer		1 if the document needs to be indexed (that is, there was no result), otherwise 0
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
	$where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);

		// A matching record means: same content already indexed
	return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
}
1605
/**
 * Counts the grlist records set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
 *
 * @param	integer		Phash integer to test.
 * @return	integer		Number of index_grlist records found for that phash_x (zero if none is set)
 */
function is_grlist_set($phash_x) {
	$where = 'phash_x='.intval($phash_x);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);

	return $GLOBALS['TYPO3_DB']->sql_num_rows($res);
}
1616
/**
 * Writes a grlist-entry for the phash and the current gr_list unless such an entry exists already.
 *
 * @param	integer		phash of the search result that should be found
 * @param	integer		The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return	void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x) {
	$where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

		// Entry already there? Then we are done.
	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		return;
	}

	$this->submit_grlist($phash,$phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
}
1632
/**
 * Sets the tstamp of a phash row to the current time.
 *
 * @param	integer		phash value
 * @param	integer		If set, update the mtime field to this value.
 * @return	void
 */
function updateTstamp($phash,$mtime=0) {
	$values = array('tstamp' => time());

		// The mtime is only touched when one is given:
	if ($mtime) {
		$values['item_mtime'] = intval($mtime);
	}

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $values);
}
1648
/**
 * Sets "freeIndexSetId" of a phash row to the value of the current configuration.
 *
 * @param	integer		phash value
 * @return	void
 */
function updateSetId($phash) {
	$where = 'phash='.intval($phash);
	$values = array('freeIndexSetId' => intval($this->conf['freeIndexSetId']));

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $values);
}
1662
/**
 * Stores the parsetime in a phash row.
 *
 * @param	integer		phash value.
 * @param	integer		Parsetime value to set.
 * @return	void
 */
function updateParsetime($phash,$parsetime) {
	$where = 'phash='.intval($phash);
	$values = array('parsetime' => intval($parsetime));

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $values);
}
1677
/**
 * Writes the current rootline fields to all section records of the current page id.
 *
 * @return	void
 */
function updateRootline() {

		// Collect rl0, rl1, rl2 (and whatever is added by configuration):
	$rootLineValues = array();
	$this->getRootLineFields($rootLineValues);

	$where = 'page_id='.intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineValues);
}
1690
/**
 * Fills the root-line fields into a field array.
 * rl0, rl1 and rl2 are standard (page uids of rootline level 0-2). More fields can be configured in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'] as "field name => rootline level" pairs.
 *
 * @param	array		Field array, passed by reference
 * @return	void
 */
function getRootLineFields(&$fieldArr) {

		// The standard fields:
	foreach(array(0,1,2) as $level) {
		$fieldArr['rl'.$level] = intval($this->conf['rootline_uids'][$level]);
	}

		// Additional, configured fields:
	$additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (is_array($additionalFields)) {
		foreach($additionalFields as $fieldName => $rootLineLevel) {
			$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
		}
	}
}
1710
1711 /**
1712 * Removes any indexed pages with userlogins which has the same contentHash
1713 * NOT USED anywhere inside this class!
1714 *
1715 * @return void
1716 */
function removeLoginpagesWithContentHash() {
		// Join phash rows with their gr_list rows. Wanted are rows of the same page grouping and the same content hash, whose gr_list hash differs from the hash of the default gr_list:
	$where = '
			A.phash=B.phash
			AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
			AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
			AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

		// Each match is logged and then handed to removeOldIndexedPages():
	while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
		$oldPhash = $row['phash'];
		$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$oldPhash."' are now removed.",1);
		$this->removeOldIndexedPages($oldPhash);
	}
}
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740 /********************************
1741 *
1742 * SQL; Submitting words
1743 *
1744 *******************************/
1745
1746 /**
1747 * Adds new words to db
1748 *
1749 * @param array Word List array (where each word has information about position etc).
1750 * @return void
1751 */
function checkWordList($wl) {
		// Purpose: make sure every word of the list has a row in index_words; words not found there are inserted.
		// $wl is keyed by the word itself; each entry carries at least 'hash' (the word id) and 'metaphone'.

		// Collect the word ids of all words in the list.
		// foreach replaces the former each() loops, since each() is deprecated as of PHP 7.2 and removed in PHP 8.0:
	$phashArr = array();
	foreach($wl as $wordInfo) {
		$phashArr[] = $wordInfo['hash'];
	}

		// Nothing to look up for an empty list:
	if (!count($phashArr)) {
		return;
	}

		// Look up which of the words are already known:
	$cwl = implode(',',$phashArr);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.$cwl.')');
	$knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

	if ($knownCount!=count($wl)) {
		$this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownCount),1);

			// Remove the words which were found in the table, leaving only the new ones in $wl:
		while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
			unset($wl[$row['baseword']]);
		}

		foreach($wl as $key => $val) {
			$insertFields = array(
				'wid' => $val['hash'],
				'baseword' => $key,
				'metaphone' => $val['metaphone']
			);
				// A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
		}
	}
}
1781
1782 /**
1783 * Submits RELATIONS between words and phash
1784 *
1785 * @param array Word list array
1786 * @param integer phash value
1787 * @return void
1788 */
function submitWords($wl,$phash) {
		// Drop all existing word relations of this phash before the new set is written:
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

		// One index_rel row is inserted per entry of the word list:
	foreach($wl as $wordInfo) {
			// Share of this word in the total word count, mapped by freqMap():
		$relativeFreq = $wordInfo['count']/$this->wordcount;

		$relation = array();
		$relation['phash'] = $phash;
		$relation['wid'] = $wordInfo['hash'];
		$relation['count'] = $wordInfo['count'];
		$relation['first'] = $wordInfo['first'];
		$relation['freq'] = $this->freqMap($relativeFreq);
			// Only the bits selected by the flag bit mask are stored:
		$relation['flags'] = ($wordInfo['cmp'] & $this->flagBitMask);

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
1805
1806 /**
1807 * maps frequency from a real number in [0;1] to an integer in [0;$this->freqRange] with anything above $this->freqMax as 1
1808 * and back.
1809 *
1810 * @param double Frequency
1811 * @return integer Frequency in range.
1812 */
function freqMap($freq) {
	$mapFactor = $this->freqMax*100*$this->freqRange;

		// Input which is not below 1 is divided by the map factor:
	if (!($freq<1)) {
		return $freq/$mapFactor;
	}

		// Input below 1 is multiplied with the map factor and capped at $this->freqRange:
	$scaled = $freq*$mapFactor;
	if ($scaled>$this->freqRange) {
		$scaled = $this->freqRange;
	}
	return $scaled;
}
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835 /********************************
1836 *
1837 * Hashing
1838 *
1839 *******************************/
1840
1841 /**
1842 * Get search hash, T3 pages
1843 *
1844 * @return void
1845 */
function setT3Hashes() {

		// Identity of the "page": id, type, language, mount point and cHash parameters.
		// The order of the keys matters, since it is the serialized array which gets hashed:
	$hArray = array();
	$hArray['id'] = (int)$this->conf['id'];
	$hArray['type'] = (int)$this->conf['type'];
	$hArray['sys_lang'] = (int)$this->conf['sys_language_uid'];
	$hArray['MP'] = (string)$this->conf['MP'];
	$hArray['cHash'] = $this->cHashParams;

		// Grouping hash, calculated before the gr_list is added:
	$serializedGrouping = serialize($hArray);
	$this->hash['phash_grouping'] = $this->md5inthash($serializedGrouping);

		// Plain phash: same array with the gr_list appended, so the login based page composition is taken into account as well:
	$hArray['gr_list'] = (string)$this->conf['gr_list'];
	$serializedPlain = serialize($hArray);
	$this->hash['phash'] = $this->md5inthash($serializedPlain);
}
1864
1865 /**
1866 * Get search hash, external files
1867 *
1868 * @param string File name / path which identifies it on the server
1869 * @param array Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
1870 * @return array Array with "phash_grouping" and "phash" inside.
1871 */
function setExtHashes($file,$subinfo=array()) {
		// Grouping hash: derived from the file reference alone:
	$identity = array('file' => $file);
	$groupingHash = $this->md5inthash(serialize($identity));

		// Plain hash: additionally takes the sub-information into account:
	$identity['subinfo'] = $subinfo;
	$plainHash = $this->md5inthash(serialize($identity));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $plainHash
	);
}
1888
1889 /**
1890 * md5 integer hash
1891 * Using 7 instead of 8 just because that makes the integers lower than 32 bit (28 bit) and so they do not interfere with UNSIGNED integers or PHP-versions which has varying output from the hexdec function.
1892 *
1893 * @param string String to hash
 * @return integer Integer interpretation of the md5 hash of input string.
1895 */
function md5inthash($str) {
		// Only the first 7 hex digits (28 bit) of the md5 digest are used:
	$digest = md5($str);
	$prefix = substr($digest, 0, 7);

	return hexdec($prefix);
}
1899
1900 /**
1901 * Calculates the cHash value of input GET array (for constructing cHash values if needed)
1902 *
1903 * @param array Array of GET parameters to encode
 * @return string The cHash value, as returned by t3lib_div::shortMD5()
1905 */
function makeCHash($paramArray) {
		// Turn the parameter array into a query string:
	$queryString = t3lib_div::implodeArrayForUrl('', $paramArray);

		// Let t3lib_div derive the cHash parameter array from that query string:
	$cHashParams = t3lib_div::cHashParams($queryString);

		// The returned value is the short md5 of the serialized parameter array:
	$serialized = serialize($cHashParams);
	return t3lib_div::shortMD5($serialized);
}
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925 /*********************************
1926 *
1927 * Internal logging functions
1928 *
1929 *********************************/
1930
1931 /**
1932 * Push function wrapper for TT logging
1933 *
1934 * @param string Title to set
1935 * @param string Key (?)
1936 * @return void
1937 */
function log_push($msg,$key) {
		// Nothing is done unless a TT object is available in the globals:
	if (!is_object($GLOBALS['TT'])) {
		return;
	}

	$GLOBALS['TT']->push($msg,$key);
}
1941
1942 /**
1943 * Pull function wrapper for TT logging
1944 *
1945 * @return void
1946 */
function log_pull() {
		// Nothing is done unless a TT object is available in the globals:
	if (!is_object($GLOBALS['TT'])) {
		return;
	}

	$GLOBALS['TT']->pull();
}
1950
1951 /**
1952 * Set log message function wrapper for TT logging
1953 *
1954 * @param string Message to set
1955 * @param integer Error number
1956 * @return void
1957 */
function log_setTSlogMessage($msg, $errorNum=0) {
		// Forward to the TT object when one is available in the globals:
	if (is_object($GLOBALS['TT'])) {
		$GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
	}

		// The message is kept in the internal log in any case:
	$this->internal_log[] = $msg;
}
1962
1963
1964
1965
1966
1967
1968
1969
1970 /**************************
1971 *
1972 * tslib_fe hooks:
1973 *
1974 **************************/
1975
1976 /**
1977 * Frontend hook: If the page is not being re-generated this is our chance to force it to be (because re-generation of the page is required in order to have the indexer called!)
1978 *
1979 * @param array Parameters from frontend
1980 * @param object TSFE object (reference under PHP5)
1981 * @return void
1982 */
function fe_headerNoCache(&$params, $ref) {
		// Purpose: force re-generation of a cached page when the crawler requested re-indexing.
		// $params is passed by reference; 'pObj' is read and 'disableAcquireCacheData' is set in it. $ref is not used here.

		// Requirement 1: the crawler extension is loaded:
	if (!t3lib_extMgm::isLoaded('crawler')) {
		return;
	}

		// Requirement 2: a crawler session is running:
	$crawler = isset($params['pObj']->applicationData['tx_crawler']) ? $params['pObj']->applicationData['tx_crawler'] : NULL;
	if (!is_array($crawler) || empty($crawler['running'])) {
		return;
	}

		// Requirement 3: re-indexing is requested as processing instruction.
		// The list is verified to be an array first, since in_array() raises a warning (a TypeError under PHP 8) for any other haystack:
	$procInstructions = isset($crawler['parameters']['procInstructions']) ? $crawler['parameters']['procInstructions'] : NULL;
	if (!is_array($procInstructions) || !in_array('tx_indexedsearch_reindex', $procInstructions)) {
		return;
	}

		// Setting simple log entry:
	$params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];

		// Disables a look-up for cached page data - thus resulting in re-generation of the page even if cached.
	$params['disableAcquireCacheData'] = TRUE;
}
1997 }
1998
1999
// XCLASS inclusion: when TYPO3_MODE is defined and a file is configured for this script under the XCLASS key of TYPO3_CONF_VARS (for the current mode), that file is included once.
if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
}
2003 ?>