99405a6dc867762b95badc7cf7ffcf306f881576
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 141: class tx_indexedsearch_indexer
39 * 207: function hook_indexContent(&$pObj)
40 *
41 * SECTION: Backend API
42 * 308: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
43 * 347: function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
44 * 365: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
45 *
46 * SECTION: Initialization
47 * 416: function init()
48 * 468: function initializeExternalParsers()
49 *
50 * SECTION: Indexing; TYPO3 pages (HTML content)
51 * 509: function indexTypo3PageContent()
52 * 596: function splitHTMLContent($content)
53 * 642: function getHTMLcharset($content)
54 * 657: function convertHTMLToUtf8($content,$charset='')
55 * 685: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
56 * 712: function typoSearchTags(&$body)
57 * 741: function extractLinks($content)
58 * 812: function extractHyperLinks($string)
59 *
60 * SECTION: Indexing; external URL
61 * 871: function indexExternalUrl($externalUrl)
62 * 902: function getUrlHeaders($url)
63 *
64 * SECTION: Indexing; external files (PDF, DOC, etc)
65 * 948: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
66 * 1054: function readFileContent($ext,$absFile,$cPKey)
67 * 1071: function fileContentParts($ext,$absFile)
68 * 1089: function splitRegularContent($content)
69 *
70 * SECTION: Analysing content, Extracting words
71 * 1122: function charsetEntity2utf8(&$contentArr, $charset)
72 * 1145: function processWordsInArrays($contentArr)
73 * 1170: function procesWordsInArrays($contentArr)
74 * 1180: function bodyDescription($contentArr)
75 * 1202: function indexAnalyze($content)
76 * 1223: function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
77 * 1242: function analyzeBody(&$retArr,$content)
78 * 1262: function metaphone($word,$retRaw=FALSE)
79 *
80 * SECTION: SQL; TYPO3 Pages
81 * 1304: function submitPage()
82 * 1378: function submit_grlist($hash,$phash_x)
83 * 1398: function submit_section($hash,$hash_t3)
84 * 1416: function removeOldIndexedPages($phash)
85 *
86 * SECTION: SQL; External media
87 * 1459: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
88 * 1525: function submitFile_grlist($hash)
89 * 1539: function submitFile_section($hash)
90 * 1553: function removeOldIndexedFiles($phash)
91 *
92 * SECTION: SQL Helper functions
93 * 1589: function checkMtimeTstamp($mtime,$phash)
94 * 1625: function checkContentHash()
95 * 1642: function checkExternalDocContentHash($hashGr,$content_md5h)
96 * 1656: function is_grlist_set($phash_x)
97 * 1669: function update_grlist($phash,$phash_x)
98 * 1684: function updateTstamp($phash,$mtime=0)
99 * 1699: function updateSetId($phash)
100 * 1714: function updateParsetime($phash,$parsetime)
101 * 1727: function updateRootline()
102 * 1742: function getRootLineFields(&$fieldArr)
103 * 1761: function removeLoginpagesWithContentHash()
104 * 1778: function includeCrawlerClass()
105 *
106 * SECTION: SQL; Submitting words
107 * 1805: function checkWordList($wl)
108 * 1842: function submitWords($wl,$phash)
109 * 1866: function freqMap($freq)
110 *
111 * SECTION: Hashing
112 * 1899: function setT3Hashes()
113 * 1925: function setExtHashes($file,$subinfo=array())
114 * 1949: function md5inthash($str)
115 * 1959: function makeCHash($paramArray)
116 *
117 * SECTION: Internal logging functions
118 * 1991: function log_push($msg,$key)
119 * 2000: function log_pull()
120 * 2011: function log_setTSlogMessage($msg, $errorNum=0)
121 *
122 * SECTION: tslib_fe hooks:
123 * 2036: function fe_headerNoCache(&$params, $ref)
124 *
125 * TOTAL FUNCTIONS: 59
126 * (This index is automatically created/updated by the extension "extdeveval")
127 *
128 */
129
130
131 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
132
133
134 /**
135 * Indexing class for TYPO3 frontend
136 *
137 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
138 * @package TYPO3
139 * @subpackage tx_indexedsearch
140 */
141 class tx_indexedsearch_indexer {
142
143 // Messages:
144 var $reasons = array(
145 -1 => 'mtime matched the document, so no changes detected and no content updated',
146 -2 => 'The minimum age was not exceeded',
147 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
148 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
149 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
150 4 => 'Page has never been indexed (is not represented in the index_phash table).'
151 );
152
153 // HTML code blocks to exclude from indexing:
154 var $excludeSections = 'script,style';
155
156 // Supported Extensions for external files:
157 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods.
158
159 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
160 var $defaultGrList = '0,-1';
161
162 // Min/Max times:
163 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
164 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
165 var $maxExternalFiles = 0; // Max number of external files to index.
166
167 var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc.
168 var $crawlerActive = FALSE; // Set when crawler is detected (internal)
169
170 // INTERNALS:
171 var $defaultContentArray=array(
172 'title' => '',
173 'description' => '',
174 'keywords' => '',
175 'body' => '',
176 );
177 var $wordcount = 0;
178 var $externalFileCounter = 0;
179
180 var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning)
181 var $indexerConfig = array(); // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
182 var $hash = array(); // Hash array, contains phash and phash_grouping
183 var $file_phash_arr = array(); // Hash array for files
184 var $contentParts = array(); // Content of TYPO3 page
185 var $content_md5h = '';
186 var $internal_log = array(); // Internal log
187 var $indexExternalUrl_content = '';
188
189 var $cHashParams = array(); // cHashparams array
190
191 var $freqRange = 32000;
192 var $freqMax = 0.1;
193
194 // Objects:
195 /**
196 * Charset class object
197 *
198 * @var t3lib_cs
199 */
200 var $csObj;
201
202 /**
203 * Metaphone object, if any
204 *
205 * @var user_DoubleMetaPhone
206 */
207 var $metaphoneObj;
208
209 /**
210 * Lexer object for word splitting
211 *
212 * @var tx_indexedsearch_lexer
213 */
214 var $lexerObj;
215
216
217
218 /**
219 * Parent Object (TSFE) Initialization
220 *
221 * @param object Parent Object (frontend TSFE object), passed by reference
222 * @return void
223 */
function hook_indexContent(&$pObj) {
	// Frontend entry point: decides whether the page rendered by $pObj may be
	// indexed and, if so, copies the needed values from $pObj into $this->conf
	// and runs the indexer. Every refusal is reported through the TS log.

	// Extension Manager settings of indexed_search (stored serialized):
	$extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// The crawler forces re-indexing when it is loaded, a crawler session is
	// running and the re-index processing instruction was requested:
	$crawlerRequestsReindex = t3lib_extMgm::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($crawlerRequestsReindex) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;	// Crawler active flag
		$this->forceIndexing = TRUE;	// Force indexing despite timestamps etc.
	}

	// Indexing has to be enabled for the page at all:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

	if ($extConf['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {

			// Reduce the rootline to a list of page uids, keeping its keys:
		$rootlineUids = array();
		foreach($pObj->config['rootLine'] as $rootlineKey => $rootlinePage) {
			$rootlineUids[$rootlineKey] = $rootlinePage['uid'];
		}

		$this->conf = array(
				// Identification of the page being indexed:
			'id' => $pObj->id,									// Page id
			'type' => $pObj->type,								// Page type
			'sys_language_uid' => $pObj->sys_language_uid,		// sys_language UID of the language of the indexing
			'MP' => $pObj->MP,									// MP variable, if any (Mount Points)
			'gr_list' => $pObj->gr_list,						// Group list

			'cHash' => $pObj->cHash,							// cHash string for additional parameters
			'cHash_array' => $pObj->cHash_array,				// Array of the additional parameters

			'crdate' => $pObj->page['crdate'],					// Creation date of the TYPO3 page
			'page_cache_reg1' => $pObj->page_cache_reg1,		// reg1 of the caching table

			'rootline_uids' => $rootlineUids,

				// Content of the page:
			'content' => $pObj->content,						// HTML of the TYPO3 page
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),	// Alternative title for indexing
			'metaCharset' => $pObj->metaCharset,				// Charset of the content (converted to utf-8 during indexing)
			'mtime' => $pObj->register['SYS_LASTCHANGED'],		// Most recent modification time of the page content; decides about re-indexing

				// Configuration of behavior:
			'index_externals' => $pObj->config['config']['index_externals'],	// Whether to index external documents like PDF, DOC etc.
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],		// Length of description text

				// Not applicable for frontend indexing:
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0
		);

			// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
299
300
301
302
303
304
305
306
307 /****************************
308 *
309 * Backend API
310 *
311 ****************************/
312
313 /**
314 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
315 *
316 * @param integer The page uid, &id=
317 * @param integer The page type, &type=
318 * @param integer sys_language uid, typically &L=
319 * @param string The MP variable (Mount Points), &MP=
320 * @param array Rootline array of only UIDs.
321 * @param array Array of GET variables to register with this indexing
322 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
323 * @return void
324 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {
	// Backend counterpart of the frontend hook: fills $this->conf from plain
	// arguments (instead of a TSFE object) and initializes the indexer.

		// Identification of the page for which the indexing takes place:
	$this->conf = array(
		'id' => $id,								// Page id (integer)
		'type' => $type,							// Page type (integer)
		'sys_language_uid' => $sys_language_uid,	// sys_language UID of the language of the indexing (integer)
		'MP' => $MP,								// MP variable, if any (Mount Points) (string)
		'gr_list' => '0,-1'							// Group list (hardcoded for now...)
	);

		// cHash string for the additional parameters, only calculated on request:
	$this->conf['cHash'] = $createCHash ? $this->makeCHash($cHash_array) : '';

	$this->conf += array(
		'cHash_array' => $cHash_array,		// Array of the additional parameters

			// Defaults:
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',

		'rootline_uids' => $uidRL,			// Root line uids

			// Configuration of behavior:
		'index_externals' => 1,				// Index external documents like PDF, DOC etc. (if possible)
		'index_descrLgd' => 200				// Length of description text (max 250, default 200)
	);

		// Initialize the indexer with this configuration:
	$this->init();
}
356
357 /**
358 * Sets the free-index uid. Can be called right after backend_initIndexer()
359 *
360 * @param integer Free index UID
361 * @param integer Set id - an integer identifying the "set" of indexing operations.
362 * @return void
363 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	// Overrides the two free-index values in the current configuration.
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId
	);
	foreach($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
368
369 /**
370 * Indexing records as the content of a TYPO3 page.
371 *
372 * @param string Title equivalent
373 * @param string Keywords equivalent
374 * @param string Description equivalent
375 * @param string The main content to index
376 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
377 * @param integer Last modification time, in seconds
378 * @param integer The creation date of the content, in seconds
379 * @param integer The record UID that the content comes from (for registration with the indexed rows)
380 * @return void
381 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {
	// Wraps the given record values in a minimal HTML document so that they
	// can run through the same pipeline as a rendered TYPO3 page.

		// Every value is escaped, so it ends up as text and never as markup:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);

		// Meta data of the content:
	$this->conf['mtime'] = $mtime;			// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;		// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

		// Fake HTML document which is parsed like a TYPO3 page:
	$this->conf['content'] = '
<html>
<head>
<title>'.$escapedTitle.'</title>
<meta name="keywords" content="'.$escapedKeywords.'" />
<meta name="description" content="'.$escapedDescription.'" />
</head>
<body>
'.$escapedBody.'
</body>
</html>';

		// Charset of the content (will be converted to utf-8 during indexing):
	$this->conf['metaCharset'] = $charset;
		// No alternative title, the <title> of the fake document is used:
	$this->conf['indexedDocTitle'] = '';

		// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}
409
410
411
412
413
414
415
416
417
418
419
420
421
422 /********************************
423 *
424 * Initialization
425 *
426 *******************************/
427
428 /**
429 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
430 *
431 * @return void
432 */
function init() {
	global $TYPO3_CONF_VARS;

	// Prepares the indexer for one run: cHash parameters, page hashes,
	// Extension Manager settings and the helper objects (external parsers,
	// lexer, metaphone, charset). $this->conf must be filled beforehand.

		// cHash parameters; only touched if there are any:
	$cHashParams = $this->conf['cHash_array'];
	if (is_array($cHashParams) && count($cHashParams)) {
		if ($this->conf['cHash']) {
			$cHashParams['cHash'] = $this->conf['cHash'];	// Add this so that URL's come out right...
		}
			// encryptionKey is added inside TSFE in order to calculate the cHash value and must NOT stay in this array, otherwise it would be exposed in links!
		unset($cHashParams['encryptionKey']);
	}
	$this->cHashParams = $cHashParams;

		// phash / phash_grouping which identify the indexed page:
	$this->setT3Hashes();

		// Indexer configuration from the Extension Manager interface (minAge/maxAge are multiplied by 3600 before use):
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

		// External document parsers are only needed if external documents are indexed.
		// Example configuration, see ext_localconf.php of this extension!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Lexer (class that deconstructs the text into words), replaceable by configuration.
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer']) {
		$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'];
	} else {
		$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	}
	$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

		// Optional metaphone hook.
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = &$this;
	}

		// Charset conversion object:
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
477
478 /**
479 * Initialize external parsers
480 *
481 * @return void
482 * @access private
483 * @see init()
484 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	// Creates one parser object per configured file extension and registers
	// it in $this->external_parsers. Parsers that cannot be created, or whose
	// initParser() call returns false, are not registered.

	if (!is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}

	foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

			// A misconfigured reference must not end in a fatal "call on a non-object"; skip it instead:
		if (!is_object($this->external_parsers[$extension]) || !method_exists($this->external_parsers[$extension], 'initParser')) {
			unset($this->external_parsers[$extension]);
			$this->log_setTSlogMessage('External parser for "'.$extension.'" could not be created and was skipped.');
			continue;
		}

			// Back-reference to the indexer for use by the parser:
		$this->external_parsers[$extension]->pObj = &$this;

			// Init parser and if it returns false, unset its entry again:
		if (!$this->external_parsers[$extension]->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515 /********************************
516 *
517 * Indexing; TYPO3 pages (HTML content)
518 *
519 *******************************/
520
521 /**
522 * Start indexing of the TYPO3 page
523 *
524 * @return void
525 */
function indexTypo3PageContent() {
	// Indexes the page described by $this->conf. The work is skipped when the
	// timestamp check says nothing is due, and reduced to a bookkeeping
	// update when an already indexed page has the same content hash.

		// $check is a reason code (see $this->reasons); values above zero mean that indexing is due:
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	if (!($check > 0 || !$is_grlist || $this->forceIndexing)) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

		// Setting message:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

		// Divide into title, keywords, description and body:
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

		// Hash over all content parts; title, keywords and description are included on purpose so that a changed title is noticed as well.
		// The glue string goes first: implode(array, glue) is not accepted by PHP 8 anymore.
	$this->content_md5h = $this->md5inthash(implode('', $this->contentParts));

		// If a page with the very same content hash is already indexed, the content does not have to be indexed again regardless of mtime.
		// This also keeps pages from being indexed again for a logged in fe_user if the content is the same as without login,
		// provided that the page has been indexed without login before.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check!==1) {
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

		// Full indexing; the time spent is stored as parsetime below:
	$Pstart = t3lib_div::milliseconds();

	$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
	$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
	$this->log_pull();

		// Splitting words
	$this->log_push('Extract words from content','');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

		// Analyse the indexed words.
	$this->log_push('Analyse the extracted words','');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

		// Submitting page (phash) record
	$this->log_push('Submitting page','');
	$this->submitPage();
	$this->log_pull();

		// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words','');
	$this->checkWordList($indexArr);
	$this->submitWords($indexArr,$this->hash['phash']);
	$this->log_pull();

		// Set parsetime
	$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

		// Checking external files if configured for.
	$this->log_push('Checking external files','');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
605
606 /**
607 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
608 *
609 * @param string HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
610 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
611 * @see splitRegularContent()
612 */
function splitHTMLContent($content) {
	// Splits an HTML document into the parts "title", "keywords",
	// "description" and "body" (plain text, tags stripped).

	$contentArr = $this->defaultContentArray;
	$dummy = '';
	$dummy2 = '';

		// Divide head from body. Without any <body tag the whole document is treated as head,
		// otherwise title and meta tags of such documents would be lost.
	$bodyPart = stristr($content,'<body');
	if ($bodyPart === FALSE) {
		$contentArr['body'] = '';
		$headPart = $content;
	} else {
		$contentArr['body'] = $bodyPart;
		$headPart = substr($content,0,strlen($content)-strlen($bodyPart));
	}

		// Title: if it contains a colon, only the part after the first colon is used
	$this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
	$titleParts = explode(':',$contentArr['title'],2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

		// Collect the attribute strings of all meta tags; each call consumes $headPart up to and including the tag found
	$metaTags = array();
	$metaParams = '';
	while($this->embracingTags($headPart,'meta',$dummy,$headPart,$metaParams)) {
		$metaTags[] = $metaParams;
	}

		// Keywords and description meta tags. Each value is appended with a leading comma (kept as is, since the content hash is calculated from these strings)
	foreach($metaTags as $metaParams) {
		$metaAttributes = t3lib_div::get_tag_attributes($metaParams);
		$metaName = isset($metaAttributes['name']) ? $metaAttributes['name'] : '';
		$metaContent = isset($metaAttributes['content']) ? $metaAttributes['content'] : '';
		if (stristr($metaName,'keywords')) {
			$contentArr['keywords'].= ','.$metaContent;
		}
		if (stristr($metaName,'description')) {
			$contentArr['description'].= ','.$metaContent;
		}
	}

		// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

		// Get rid of unwanted sections (ie. scripting and style stuff) in body
	$tagList = explode(',',$this->excludeSections);
	foreach($tagList as $tag) {
		while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
	}

		// Remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<',' <',$contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

	return $contentArr;
}
652
653 /**
654 * Extract the charset value from HTML meta tag.
655 *
656 * @param string HTML content
657 * @return string The charset value if found.
658 */
function getHTMLcharset($content) {
	// Looks for a <meta http-equiv="Content-Type" ...> tag and returns the
	// value of its "charset=" part. Returns NULL if there is no such tag or
	// no charset in it.
	// preg_match() with the "i" modifier replaces eregi(), which is no longer
	// available as of PHP 7; the patterns themselves are unchanged.
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
		if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) {
			return $reg2[1];
		}
	}
	return NULL;
}
666
667 /**
668 * Converts a HTML document to utf-8
669 *
670 * @param string HTML content, any charset
671 * @param string Optional charset (otherwise extracted from HTML)
672 * @return string Converted HTML
673 */
function convertHTMLToUtf8($content,$charset='') {
	// Returns $content as utf-8 with HTML entities converted to characters.

		// Without an explicit charset the one declared in the document is used:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

		// Content which is utf-8 already (or has no known charset) is not transcoded:
	$needsTranscoding = $charset && $charset!=='utf-8';
	if ($needsTranscoding) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

		// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
689
690 /**
691 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
692 * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
693 * <title> of document or removing <script>-sections
694 *
695 * @param string String to search in
696 * @param string Tag name, eg. "script"
697 * @param string Passed by reference: Content inside found tag
698 * @param string Passed by reference: Content after found tag
699 * @param string Passed by reference: Attributes of the found tag.
700 * @return boolean Returns false if tag was not found, otherwise true.
701 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	// Case-insensitive search for the first <tagName ...>...</tagName> pair.
	// The three reference arguments are only written if the start tag exists.
	$openTag = '<'.$tagName;
	$closeTag = '</'.$tagName.'>';

		// Rest of the string, beginning at the start tag:
	$fromOpenTag = stristr($string,$openTag);
	if (!$fromOpenTag) {
		return false;
	}

		// Everything up to the first ">" are the tag attributes, the rest follows the start tag:
	list($paramList,$remainder) = explode('>',substr($fromOpenTag,strlen($openTag)),2);
	$fromCloseTag = stristr($remainder,$closeTag);

	if (!$fromCloseTag) {
			// No end tag: the tag content is blank and only what follows the start tag is handed back.
		$tagContent = '';
		$stringAfter = $remainder;
		return true;
	}

		// End tag found: hand back the embraced content and the string with the whole element cut out.
	$leading = substr($string,0,strlen($string)-strlen($fromOpenTag));
	$tagContent = substr($remainder,0,strlen($remainder)-strlen($fromCloseTag));
	$stringAfter = $leading.substr($fromCloseTag,strlen($closeTag));

	return true;
}
722
723 /**
724 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
725 *
726 * @param string HTML Content, passed by reference
727 * @return boolean Returns true if a TYPO3SEARCH_ tag was found, otherwise false.
728 */
function typoSearchTags(&$body) {
	// Reduces $body to the sections selected by <!--TYPO3SEARCH_begin--> and
	// <!--TYPO3SEARCH_end--> markers. $body is left untouched if it contains
	// no marker at all.
	$expBody = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

	if (count($expBody) <= 1) {
		return false;
	}

	$body = '';
	$prev = '';		// Content seen since the last "begin" marker; set up front so that an "end" marker never reads an undefined variable

	foreach($expBody as $val) {
		$part = explode('-->',$val,2);
		$marker = trim($part[0]);

		if ($marker=='begin') {
				// Content after a "begin" marker is indexed; a marker without closing "-->" contributes nothing
			$body.= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker=='end') {
				// Content after an "end" marker is dropped; content from before it is only kept if no "begin" marker preceded it
			$body.= $prev;
		} else {
			$prev = $val;
		}
	}

	return true;
}
751
752 /**
753 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
754 *
755 * @param string HTML content
756 * @return void
757 */
function extractLinks($content) {
	// Walks through all hyperlinks of $content. Links with a URL scheme are
	// indexed as external URLs (if configured); links without scheme and
	// without query string which point to an existing local file are indexed
	// as documents, either directly or by adding them to the crawler queue.

		// Get links:
	$list = $this->extractHyperLinks($content);

		// The crawler object only exists if files shall be handed over to the crawler queue:
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

		// Traverse links:
	foreach($list as $linkInfo) {

			// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified in localPath!
		$hasLocalPath = !empty($linkInfo['localPath']);

			// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

			// Parse URL:
		$qParts = parse_url($linkSource);

			// Check for jumpurl (TYPO3 specific thing...): the real target is found in the jumpurl parameter
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			$getP = array();
			parse_str($qParts['query'],$getP);
			if (!isset($getP['jumpurl'])) {
				continue;	// "jumpurl=" was only part of another parameter name, there is no target to index
			}
			$linkSource = $getP['jumpurl'];
			$qParts = parse_url($linkSource);	// parse again due to new linkSource!
		}

			// Links with a scheme are external URLs (http or otherwise):
		if (!empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

			// Links with a query string are not treated as files:
		if (!empty($qParts['query'])) {
			continue;
		}

			// Find the absolute path of the local file:
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

			// Index local file:
		if (is_object($crawler)) {
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);	// The page content is not passed on to the queue entry

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($hasLocalPath) {
				// The file is read from the local path, but registered with the href; the extension is taken from the local path:
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
836
/**
 * Collects all <a> tags that carry a non-empty href attribute from a content string.
 *
 * Each returned entry is an array with the keys:
 * - "tag": the tag as returned by the HTML parser,
 * - "href": the value of the href attribute,
 * - "localPath": the entry registered in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']
 *   under the short MD5 of the href (empty string if none is registered, FALSE if that registry is not an array).
 *
 * @param	string		Content to analyse
 * @return	array		Array of hyperlinks
 * @see extractLinks()
 */
function extractHyperLinks($string) {
		// Create the HTML parser on first use
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$hyperLinks = array();
	$segments = $this->htmlParser->splitTags('a',$string);

	foreach ($segments as $position => $segment) {
			// Only the odd positions are inspected as tags
		if (!($position%2)) {
			continue;
		}

		$attributes = $this->htmlParser->get_tag_attributes($segment,1);
		$tagName = $this->htmlParser->getFirstTagName($segment);	// The 'name' of the first tag

		if (strtolower($tagName) != 'a') {
			continue;
		}

		$src = $attributes[0]['href'];
		if (!$src) {
			continue;
		}

			// Check if a local path to that file has been set - useful if you are using a download script.
		$md5 = t3lib_div::shortMD5($src);
		$indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
		if (is_array($indexLocalFiles)) {
			$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
		} else {
			$localPath = false;
		}

		$hyperLinks[] = array(
			'tag' => $segment,
			'href' => $attributes[0]['href'],
			'localPath' => $localPath
		);
	}

	return $hyperLinks;
}
879
880
881
882
883
884
885
886
887
888
889
890 /******************************************
891 *
892 * Indexing; external URL
893 *
894 ******************************************/
895
/**
 * Index External URLs HTML content
 *
 * The URL is only fetched and indexed if its response headers announce a
 * "text/html" content type. The fetched content is also kept in
 * $this->indexExternalUrl_content.
 *
 * @param	string		URL, eg. "http://typo3.org/"
 * @return	void
 * @see indexRegularDocument()
 */
function indexExternalUrl($externalUrl) {

		// Get headers; anything that is not announced as HTML is ignored:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders) || !isset($urlHeaders['Content-Type']) || !stristr($urlHeaders['Content-Type'],'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

		// Create temporary file.
		// The name returned by t3lib_div::tempnam() is used as-is: appending a suffix would write to a
		// second file and leave the one created by tempnam() behind (presumably t3lib_div::tempnam()
		// wraps PHP's tempnam(), which creates the file - TODO confirm). The "html" type is passed
		// explicitly to indexRegularDocument(), so no file extension is needed on the temp file.
	$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
	if ($tmpFile) {
		t3lib_div::writeFile($tmpFile, $content);

			// Index that file:
			// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
		unlink($tmpFile);
	}
}
926
/**
 * Getting HTTP response headers of URL
 *
 * Header names are used as array keys exactly as they were received (no case
 * normalization), values are trimmed. Lines without a colon (such as a status
 * line) are kept as keys with an empty string as value.
 *
 * @param	string		The URL
 * @return	mixed		If no answer, returns false. Otherwise an array where HTTP headers are keys
 */
function getUrlHeaders($url) {
	$content = t3lib_div::getURL($url,2);	// Try to get the headers only

	if (!strlen($content)) {
		return FALSE;
	}

		// Compile headers:
	$headers = t3lib_div::trimExplode(chr(10),$content,1);
	$retVal = array();
	foreach($headers as $line) {
		if (!strlen(trim($line))) {
			break;	// Stop at the first empty line (= end of header)
		}

			// Split into at most two parts so colons inside the value survive
		$headParts = explode(':', $line, 2);
		$retVal[trim($headParts[0])] = isset($headParts[1]) ? trim($headParts[1]) : '';
	}

	return $retVal;
}
952
953
954
955
956
957
958
959
960
961
962
963
964
965 /******************************************
966 *
967 * Indexing; external files (PDF, DOC, etc)
968 *
969 ******************************************/
970
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is divided into parts by fileContentParts(); each part is indexed on its own
 * if checkMtimeTstamp() says so (or $force is set), if the external file limit is not yet
 * reached (or $force is set) and if its content hash is not already known (or $force is set).
 * A section record is registered for every part, whether it was (re)indexed or not.
 *
 * @param	string		Relative Filename, relative to PATH_site. It can also be an absolute path (detected with t3lib_div::isAbsPath()) as long as it passes t3lib_div::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param	boolean		If set, indexing is forced (despite content hashes, mtime etc).
 * @param	string		Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param	string		File extension for temporary file.
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

		// Init
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

		// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

		// Nothing to do without a readable file or a parser for the extension:
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

		// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

					// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
						// Separator first: the reversed implode() argument order is not accepted by PHP 8 anymore.
					$content_md5h = $this->md5inthash(implode('',$contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// Checking and setting sections:
#		$this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
		$this->submitFile_section($phash_arr['phash']);	// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->log_pull();
	}
}
1076
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return	array		Standard content array (title, description, keywords, body keys) as delivered by the external parser. NULL if no parser object is registered for the extension.
 */
function readFileContent($ext,$absFile,$cPKey) {
		// Explicit default so the "no parser" case does not read an unset variable
	$contentArr = NULL;

		// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
1095
/**
 * Asks the external parser of a file type into which sections a document has to be divided.
 * Without a parser object for the extension the document is one single section, array(0).
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Array of pointers to sections that the document should be divided into
 */
function fileContentParts($ext,$absFile) {
		// No parser object registered: no division
	if (!is_object($this->external_parsers[$ext])) {
		return array(0);
	}

		// Consult relevant external document parser:
	return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
}
1113
/**
 * Wraps non-HTML content (from external files for instance) into a standard content array.
 * The array starts out as a copy of $this->defaultContentArray; only "body" is overwritten.
 *
 * @param	string		Input content (non-HTML) to index.
 * @return	array		Array of content, having the key "body" set to the input content
 * @see splitHTMLContent()
 */
function splitRegularContent($content) {
		// Start from the default array, then put the content into the body
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;

	return $parts;
}
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141 /**********************************
1142 *
1143 * Analysing content, Extracting words
1144 *
1145 **********************************/
1146
/**
 * Convert character set and HTML entities in the value of input content array keys
 *
 * Empty values are left untouched. Non-empty values are first converted to utf-8
 * (unless $charset is exactly "utf-8") and then have their entities decoded.
 *
 * @param	array		Standard content array, passed by reference and modified in place
 * @param	string		Charset of the input content (converted to utf-8)
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset) {

		// Iterate over the keys (each() does not exist in PHP 8 anymore) and write back by key
	foreach (array_keys($contentArr) as $key) {
		if (!strlen($contentArr[$key])) {
			continue;
		}

			// Convert charset if necessary
		if ($charset!=='utf-8') {
			$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
		}

			// decode all numeric / html-entities in the string to real characters:
		$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
	}
}
1170
/**
 * Processing words in the array from split*Content -functions
 *
 * Every value of the content array is split into words by the lexer. For the
 * "title", "keywords" and "description" keys duplicate words are removed afterwards.
 *
 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return	array		Content input array modified so each key is now an array of words (unique words for title, keywords and description)
 */
function processWordsInArrays($contentArr) {

		// split all parts to words (each() does not exist in PHP 8 anymore, so iterate over the keys)
	foreach (array_keys($contentArr) as $key) {
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

		// For title, keywords, and description we don't want duplicates:
	$contentArr['title'] = array_unique($contentArr['title']);
	$contentArr['keywords'] = array_unique($contentArr['keywords']);
	$contentArr['description'] = array_unique($contentArr['description']);

		// Return modified array:
	return $contentArr;
}
1193
/**
 * Misspelled alias of processWordsInArrays(), kept so that existing callers keep working.
 * All work is delegated to processWordsInArrays().
 *
 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return	array		Whatever processWordsInArrays() returns for the input
 * @deprecated
 */
function procesWordsInArrays($contentArr) {
	$wordArrays = $this->processWordsInArrays($contentArr);

	return $wordArrays;
}
1205
/**
 * Extracts the sample description text from the content array.
 *
 * The maximum length is taken from $this->conf['index_descrLgd'] (passed through
 * t3lib_div::intInRange() with the arguments 0, 255, 200). If the resulting length
 * is zero, an empty string is returned.
 *
 * @param	array		Content array
 * @return	string		Description string
 */
function bodyDescription($contentArr) {
		// Explicit default so a zero length does not return an unset variable
	$bodyDescription = '';

		// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
			// NOTE(review): the first search string appears as a single blank in this copy of the file;
			// verify against the upstream source whether a different character or entity was meant.
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

			// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1227
/**
 * Analyzes content to use for indexing.
 *
 * Title, keywords and description words are registered through analyzeHeaderinfo()
 * with the bit offsets 7, 6 and 5 respectively; body words through analyzeBody().
 *
 * @param	array		Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return	array		Index array, keyed by word, as filled by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content) {
	$indexArr = array();

	$this->analyzeHeaderinfo($indexArr,$content,'title',7);
	$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
	$this->analyzeHeaderinfo($indexArr,$content,'description',5);
	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}
1245
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the entry in the index array gets the bit
 * 2^$offset OR-ed into "cmp", its "count" incremented, and "hash" / "metaphone"
 * (re)calculated. $this->wordcount is incremented per word.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @param	string		Key from standard content array
 * @param	integer		Bit-wise priority to type
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
	$typeBit = pow(2,$offset);

		// foreach instead of each(), which does not exist in PHP 8 anymore
	foreach ($content[$key] as $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

			// First occurrence of a word starts from 0 instead of reading unset entries
		$currentCmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

		$retArr[$val]['cmp'] = $currentCmp|$typeBit;
		$retArr[$val]['count'] = $currentCount+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
1266
/**
 * Calculates relevant information for bodycontent
 *
 * Words not yet present in the index array get "first" (the position key of the
 * word in the body array), "hash" and "metaphone" set. "count" is incremented for
 * every occurrence, and so is $this->wordcount.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @return	void
 */
function analyzeBody(&$retArr,$content) {
	foreach($content['body'] as $key => $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		if (!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

			// First occurrence starts from 0 instead of reading an unset entry
		$currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $currentCount+1;
		$this->wordcount++;
	}
}
1286
/**
 * Calculates the metaphone value of a word, either raw or as an integer hash.
 *
 * If $this->metaphoneObj is an object, its metaphone() method is used (it also
 * receives $this->conf['sys_language_uid']); otherwise PHP's metaphone() function.
 *
 * @param	string		Word to convert
 * @param	boolean		If set, returns the raw metaphone value (not hashed)
 * @return	mixed		Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 */
function metaphone($word,$retRaw=FALSE) {

	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

		// Return raw value?
	if ($retRaw) {
		return $rawValue;
	}

		// Otherwise create hash and return integer
	if ($rawValue=='') {
		return 0;
	}
	return hexdec(substr(md5($rawValue),0,7));
}
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325 /********************************
1326 *
1327 * SQL; TYPO3 Pages
1328 *
1329 *******************************/
1330
/**
 * Writes the index records of the current TYPO3 page (not external media) to the database.
 *
 * Existing records for the phash are removed first. Then a row is inserted into
 * index_phash, the section and gr_list records are written, the full text goes into
 * index_fulltext (cut to "fullTextDataLength" if that is larger than zero) and, in
 * debug mode, a row is inserted into index_debug.
 *
 * @return	void
 */
function submitPage() {
	$phash = $this->hash['phash'];

		// Remove any current data for this phash:
	$this->removeOldIndexedPages($phash);

		// setting new phash_row
	$phashRow = array(
		'phash' => $phash,
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

		// PROCESSING index_section
	$this->submit_section($phash,$phash);

		// PROCESSING index_grlist
	$this->submit_grlist($phash,$phash);

		// PROCESSING index_fulltext
	$fullText = implode(' ', $this->contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', array(
		'phash' => $phash,
		'fulltextdata' => $fullText
	));

		// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

		// Content and body are shortened to 1000 bytes for the debug record
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
		'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', array(
		'phash' => $phash,
		'debuginfo' => serialize($debugInfo)
	));
}
1401
/**
 * Inserts a gr_list record into index_grlist for the group list in $this->conf['gr_list'].
 *
 * @param	integer		Search result record phash
 * @param	integer		Actual phash of current content
 * @return	void
 * @see update_grlist()
 */
function submit_grlist($hash,$phash_x) {
	$grList = $this->conf['gr_list'];

		// Setting the gr_list record
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($grList),
		'gr_list' => $grList
	));
}
1421
/**
 * Inserts a section record into index_section.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 * The rootline fields are added to the record by getRootLineFields() before it is inserted.
 *
 * @param	integer		phash of TYPO3 parent search result record
 * @param	integer		phash of the file indexation search record
 * @return	void
 */
function submit_section($hash,$hash_t3) {
	$sectionRow = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

		// Receives the record by reference
	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1441
/**
 * Deletes all index records of a TYPO3 page, identified by $phash.
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedPages($phash) {
	$phashWhere = 'phash='.intval($phash);

		// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $phashWhere);
	}

		// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.intval($phash));
}
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470 /********************************
1471 *
1472 * SQL; External media
1473 *
1474 *******************************/
1475
1476
/**
 * Writes the index records of an external file (or external URL) to the database.
 *
 * Existing records for the phash are removed first. Then a row is inserted into
 * index_phash, the full text goes into index_fulltext (cut to "fullTextDataLength" if
 * that is larger than zero) and, in debug mode, a row is inserted into index_debug.
 *
 * @param	array		Array with phash and phash_grouping keys for file
 * @param	string		File name
 * @param	array		Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param	string		File extension determining the type of media.
 * @param	integer		Modification time of file.
 * @param	integer		Creation time of file.
 * @param	integer		Size of file in bytes
 * @param	integer		Content HASH value.
 * @param	array		Standard content array (using only title and body for a file)
 * @return	void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

		// Find item Type: the parser's mapping for the extension, or the extension itself as fallback
	$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	if (!$storeItemType) {
		$storeItemType = $ext;
	}

		// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

		// Split filename; a scheme in it marks the file as external URL:
	$fileParts = parse_url($file);

		// A title consisting of whitespace only is replaced by the base name of the file
	$itemTitle = trim($contentParts['title']) ? $contentParts['title'] : basename($file);

		// Setting new
	$phashRow = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => $itemTitle,
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => time(),
		'crdate' => time(),
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $fileParts['scheme'] ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

		// PROCESSING index_fulltext
	$fullText = implode(' ', $contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', array(
		'phash' => $hash['phash'],
		'fulltextdata' => $fullText
	));

		// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

		// Body is shortened to 1000 bytes for the debug record
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', array(
		'phash' => $hash['phash'],
		'debuginfo' => serialize($debugInfo)
	));
}
1550
/**
 * Writes a gr_list record for a file, unless one already exists for either the
 * default group list or the current group list.
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_grlist($hash) {
		// Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
	$where = 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')';
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		return;
	}

	$this->submit_grlist($hash,$hash);
}
1564
/**
 * Writes a section record for a file, unless one already exists for the file
 * on the current page ($this->conf['id']).
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_section($hash) {
		// Testing if there is a section
	$where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		return;
	}

		// The section links the file to the phash of the current page
	$this->submit_section($hash,$this->hash['phash']);
}
1578
/**
 * Deletes the index records of an external file, identified by $phash.
 * In contrast to removeOldIndexedPages() the index_section table is left untouched.
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedFiles($phash) {
	$phashWhere = 'phash='.intval($phash);

		// Removing old registrations for tables.
	$tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $phashWhere);
	}
}
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607 /********************************
1608 *
1609 * SQL Helper functions
1610 *
1611 *******************************/
1612
/**
 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
 * Return positive integer if the page needs to be indexed
 *
 * Result codes (see $this->reasons):
 *  4: No indexed page found, so of course we will index.
 *  1: Max age exceeded, page must be indexed again.
 * -2: Min age was NOT exceeded and so indexing cannot occur.
 *  3: No mtime was set, so we will index...
 *  2: mtime of indexed page doesn't match mtime given for current content and we must index page.
 * -1: mtime matched so no need to reindex page. The timestamp of the record is refreshed, unless a max age is configured.
 *
 * @param	integer		mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param	integer		"phash" used to select any already indexed page to see what its mtime is.
 * @return	integer		Result integer: Generally: <0 = No indexing, >0 = Do indexing
 */
function checkMtimeTstamp($mtime,$phash) {

		// Select indexed page:
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

		// Page has never been indexed (is not represented in the index_phash table).
	if (!$row) {
		return 4;
	}

		// The configured max-age was exceeded for the document and thus it's indexed.
	if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time()) {
		return 1;
	}

		// A min-age is set and has not been exceeded yet
	if ($this->tstamp_minAge && ($row['tstamp']+$this->tstamp_minAge) >= time()) {
		return -2;
	}

		// The minimum age was exceeded, but mtime was not set, so the page is indexed.
	if (!$mtime) {
		return 3;
	}

		// mtime is different from the index_phash mtime, it's about time to re-index.
	if ($row['item_mtime'] != $mtime) {
		return 2;
	}

		// mtime matched the document, so no changes detected and no content updated
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
	} else {
		$this->updateTstamp($phash);	// Update the timestamp
		$this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
	}
	return -1;
}
1651
/**
 * Looks for an index_phash record with the phash_grouping and content hash of the current page.
 *
 * @return	mixed		Returns 1 if the page needs to be indexed (that is, there was no result), otherwise the found row (an array with the key "phash") of the phash record to which the grlist_record should be related!
 */
function checkContentHash() {
		// With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
	$where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
	return $row ? $row : 1;
}
1665
/**
 * Check content hash for external documents:
 * Looks for an index_phash record with the given phash_grouping and content hash.
 *
 * @param	integer		phash value to check (phash_grouping)
 * @param	integer		Content hash to check
 * @return	integer		Returns 1 if the document needs to be indexed (that is, there was no result), otherwise 0
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
	$where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);

		// Any matching row means the content is already indexed
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
	return $row ? 0 : 1;
}
1681
/**
 * Tells whether index_grlist holds any record for the phash value given (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
 *
 * @param integer $phash_x Phash integer to test.
 * @return integer Number of index_grlist rows with that phash_x, as reported by sql_num_rows(); zero when none is set.
 */
function is_grlist_set($phash_x) {
	$where = 'phash_x='.intval($phash_x);
	$result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);
	$rowCount = $GLOBALS['TYPO3_DB']->sql_num_rows($result);

	return $rowCount;
}
1692
/**
 * Makes sure a grlist entry exists for the phash and the current gr_list; one is written (and logged) if it is missing.
 *
 * @param integer $phash phash of the search result that should be found
 * @param integer $phash_x The real phash of the current content. The two values differ when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; this is the whole reason for the grlist table.
 * @return void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x) {
	$where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
	$result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

	// Nothing to do if the entry is already there:
	if ($GLOBALS['TYPO3_DB']->sql_num_rows($result)) {
		return;
	}

	$this->submit_grlist($phash,$phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
}
1708
/**
 * Sets the tstamp field of a phash row to the current time.
 *
 * @param integer $phash phash value
 * @param integer $mtime If set (non-zero), the item_mtime field is updated to this value as well.
 * @return void
 */
function updateTstamp($phash,$mtime=0) {
	$where = 'phash='.intval($phash);

	$fields = array();
	$fields['tstamp'] = time();
	if ($mtime) {
		$fields['item_mtime'] = intval($mtime);
	}

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
1724
/**
 * Writes the configured "freeIndexSetId" into the index_phash record.
 *
 * @param integer $phash phash value
 * @return void
 */
function updateSetId($phash) {
	$where = 'phash='.intval($phash);
	$fields = array();
	$fields['freeIndexSetId'] = intval($this->conf['freeIndexSetId']);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
1738
/**
 * Stores the parsetime for a phash row.
 *
 * @param integer $phash phash value.
 * @param integer $parsetime Parsetime value to set (cast to integer before it is written).
 * @return void
 */
function updateParsetime($phash,$parsetime) {
	$where = 'phash='.intval($phash);
	$fields = array();
	$fields['parsetime'] = intval($parsetime);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
1753
/**
 * Refreshes the rootline fields of all index_section records belonging to the current page id.
 *
 * @return void
 * @see getRootLineFields()
 */
function updateRootline() {
	// Let getRootLineFields() fill in the field values (the array is passed by reference):
	$rootLineFields = array();
	$this->getRootLineFields($rootLineFields);

	$where = 'page_id='.intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
}
1766
/**
 * Fills in the values of the root-line fields.
 * rl0, rl1 and rl2 are always set from levels 0-2 of $this->conf['rootline_uids'].
 * Further fields are added from $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'] (field name => rootline level) if that is an array.
 *
 * @param array $fieldArr Field array, passed by reference
 * @return void
 */
function getRootLineFields(&$fieldArr) {
		// Standard fields:
	$standardLevels = array('rl0' => 0, 'rl1' => 1, 'rl2' => 2);
	foreach ($standardLevels as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}

		// Additionally configured fields:
	$additionalLevels = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($additionalLevels)) {
		return;
	}
	foreach ($additionalLevels as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}
}
1786
/**
 * Removes indexed pages which share the current grouping hash and content hash but were indexed with a gr_list hash different from that of $this->defaultGrList.
 * NOT USED anywhere inside this class!
 *
 * @return void
 */
function removeLoginpagesWithContentHash() {
		// The WHERE clause is built with line breaks between the conditions:
	$where = "\nA.phash=B.phash\nAND A.phash_grouping=".intval($this->hash['phash_grouping']).
		"\nAND B.hash_gr_list!=".$this->md5inthash($this->defaultGrList).
		"\nAND A.contentHash=".intval($this->content_md5h);
	$result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

		// Log and remove each record found:
	while ($loginPage = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result)) {
		$stalePhash = $loginPage['phash'];
		$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$stalePhash."' are now removed.",1);
		$this->removeOldIndexedPages($stalePhash);
	}
}
1804
/**
 * Loads the library class file of the "crawler" extension (class.tx_crawler_lib.php), once.
 *
 * @return void
 */
function includeCrawlerClass() {
		// A file included inside a function is evaluated in that function's variable scope.
		// NOTE(review): the global declaration presumably exists so that the included file can read $TYPO3_CONF_VARS - confirm before removing it.
	global $TYPO3_CONF_VARS;

	require_once t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php';
}
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825 /********************************
1826 *
1827 * SQL; Submitting words
1828 *
1829 *******************************/
1830
/**
 * Adds new words to db.
 * Looks up which of the words are already found in index_words (by their "wid" hash) and inserts those that are not.
 *
 * @param array $wl Word List array; keys are the words ("baseword"), each value is an array with at least the keys "hash" (becomes "wid") and "metaphone".
 * @return void
 */
function checkWordList($wl) {
		// Collect the word ids. foreach is used instead of each(), which is deprecated as of PHP 7.2 and removed in PHP 8.0.
		// The ids are cast to integers before they go into the query, like all other values put into queries by this class.
	$wordIds = array();
	foreach ($wl as $wordData) {
		$wordIds[] = intval($wordData['hash']);
	}
	if (!count($wordIds)) {
		return;
	}

	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',',$wordIds).')');
	$knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

		// If every word was found, there is nothing to insert:
	if ($knownCount==count($wl)) {
		return;
	}
	$this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownCount),1);

		// Take the words which exist already out of the list:
	while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
		unset($wl[$row['baseword']]);
	}

		// What is left are the new words:
	foreach ($wl as $baseword => $wordData) {
		$insertFields = array(
			'wid' => $wordData['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordData['metaphone']
		);
			// A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
	}
}
1866
/**
 * Submits RELATIONS between words and phash.
 * All existing index_rel records of the phash are deleted first, then one record is inserted per word.
 *
 * @param array $wl Word list array; each entry is read for the keys "hash", "count", "first" and "cmp".
 * @param integer $phash phash value
 * @return void
 */
function submitWords($wl,$phash) {
		// Clear out the old relations of this phash:
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

	foreach ($wl as $wordData) {
			// Share of this word in the total word count, mapped by freqMap():
		$relativeFreq = $wordData['count']/$this->wordcount;

		$relation = array();
		$relation['phash'] = $phash;
		$relation['wid'] = $wordData['hash'];
		$relation['count'] = $wordData['count'];
		$relation['first'] = $wordData['first'];
		$relation['freq'] = $this->freqMap($relativeFreq);
		$relation['flags'] = ($wordData['cmp'] & $this->flagBitMask);

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
1890
/**
 * Maps frequency from a real number in [0;1] to a number in [0;$this->freqRange], with anything above $this->freqRange capped to $this->freqRange
 * - and back: a value above 1 is divided by the map factor instead.
 * The map factor is $this->freqMax*100*$this->freqRange.
 *
 * @param double $freq Frequency
 * @return mixed Frequency in range (a number; not cast to integer here).
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax*100*$this->freqRange;
		// The upper bound 1 is part of the [0;1] input range and must be scaled like every other value in it.
		// With "<" a frequency of exactly 1 was divided by the map factor and ended up close to zero.
	if ($freq<=1) {
		$newFreq = $freq*$mapFactor;
		$newFreq = $newFreq>$this->freqRange ? $this->freqRange : $newFreq;
	} else {
		$newFreq = $freq/$mapFactor;
	}
	return $newFreq;
}
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920 /********************************
1921 *
1922 * Hashing
1923 *
1924 *******************************/
1925
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash ("phash_grouping" and "phash").
 *
 * @return void
 */
function setT3Hashes() {

		// Values identifying the "page"; order and types matter since the array is serialized before it is hashed:
	$pageIdentity = array(
		'id' => (int)$this->conf['id'],
		'type' => (int)$this->conf['type'],
		'sys_lang' => (int)$this->conf['sys_language_uid'],
		'MP' => (string)$this->conf['MP'],
		'cHash' => $this->cHashParams
	);

		// Grouping hash (identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
	$serializedGrouping = serialize($pageIdentity);
	$this->hash['phash_grouping'] = $this->md5inthash($serializedGrouping);

		// Plain phash: the same values plus gr_list (subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
	$pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
	$serializedPage = serialize($pageIdentity);
	$this->hash['phash'] = $this->md5inthash($serializedPage);
}
1949
/**
 * Calculates the search hashes for an external file.
 * "phash_grouping" is made from the file alone, "phash" from the file together with the subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
function setExtHashes($file,$subinfo=array()) {
		// Grouping hash: the file only
	$identity = array('file' => $file);
	$groupingHash = $this->md5inthash(serialize($identity));

		// Plain phash: file and subinfo
	$identity['subinfo'] = $subinfo;
	$fullHash = $this->md5inthash(serialize($identity));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $fullHash
	);
}
1973
/**
 * md5 integer hash: the first 7 hex digits of the md5 hash of the string, converted to a number.
 * 7 digits are used instead of 8 because that keeps the integers below 32 bit (28 bit), so they do not interfere with UNSIGNED integers or PHP versions which have varying output from the hexdec function.
 *
 * @param string $str String to hash
 * @return integer Integer interpretation of the md5 hash of input string.
 */
function md5inthash($str) {
	$hexDigits = substr(md5($str), 0, 7);

	return hexdec($hexDigits);
}
1984
/**
 * Calculates the cHash value of input GET array (for constructing cHash values if needed).
 * The array is imploded to a query string, passed through t3lib_div::cHashParams() and the serialized result is hashed with t3lib_div::shortMD5().
 *
 * @param array $paramArray Array of GET parameters to encode
 * @return mixed The value returned from t3lib_div::shortMD5() (presumably a short hash string - confirm in t3lib_div)
 */
function makeCHash($paramArray) {
	$queryString = t3lib_div::implodeArrayForUrl('', $paramArray);
	$hashParams = t3lib_div::cHashParams($queryString);
	$serializedParams = serialize($hashParams);

	return t3lib_div::shortMD5($serializedParams);
}
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010 /*********************************
2011 *
2012 * Internal logging functions
2013 *
2014 *********************************/
2015
/**
 * Wrapper for the push function of the TT object; does nothing if $GLOBALS['TT'] is not an object.
 *
 * @param string $msg Title to set
 * @param string $key Key (?)
 * @return void
 */
function log_push($msg,$key) {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->push($msg,$key);
}
2026
/**
 * Wrapper for the pull function of the TT object; does nothing if $GLOBALS['TT'] is not an object.
 *
 * @return void
 */
function log_pull() {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->pull();
}
2035
/**
 * Wrapper for the setTSlogMessage function of the TT object.
 * The message is appended to $this->internal_log in any case, also when $GLOBALS['TT'] is not an object.
 *
 * @param string $msg Message to set
 * @param integer $errorNum Error number
 * @return void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
	if (is_object($GLOBALS['TT'])) {
		$GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
	}

		// Keep a copy of the message inside the indexer object:
	$this->internal_log[] = $msg;
}
2047
2048
2049
2050
2051
2052
2053
2054
2055 /**************************
2056 *
2057 * tslib_fe hooks:
2058 *
2059 **************************/
2060
/**
 * Frontend hook: If the page is not being re-generated this is our chance to force it to be (because re-generation of the page is required in order to have the indexer called!)
 * When all requirements are met a log entry is added to the crawler data and $params['disableAcquireCacheData'] is set to TRUE.
 *
 * @param array $params Parameters from frontend; "pObj" and "disableAcquireCacheData" are used here
 * @param object $ref TSFE object (reference under PHP5); not used in this function
 * @return void
 */
function fe_headerNoCache(&$params, $ref) {

		// Requirement 1: the crawler extension is loaded
	if (!t3lib_extMgm::isLoaded('crawler')) {
		return;
	}
		// Requirement 2: a crawler session is running
	if (!$params['pObj']->applicationData['tx_crawler']['running']) {
		return;
	}
		// Requirement 3: re-indexing is requested as processing instruction
	if (!in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])) {
		return;
	}

		// Setting simple log entry (with the status as it was before it is changed below):
	$params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];

		// Disables a look-up for cached page data - thus resulting in re-generation of the page even if cached.
	$params['disableAcquireCacheData'] = TRUE;
}
2082 }
2083
2084
// XCLASS inclusion: if TYPO3_MODE is defined and a file is registered for this script in $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], that file is included (once).
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
	}
}
2088 ?>