* Merging all changes from TYPO3_4-0 branch back into HEAD
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 141: class tx_indexedsearch_indexer
39 * 207: function hook_indexContent(&$pObj)
40 *
41 * SECTION: Backend API
42 * 308: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
43 * 347: function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
44 * 365: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
45 *
46 * SECTION: Initialization
47 * 416: function init()
48 * 468: function initializeExternalParsers()
49 *
50 * SECTION: Indexing; TYPO3 pages (HTML content)
51 * 509: function indexTypo3PageContent()
52 * 596: function splitHTMLContent($content)
53 * 642: function getHTMLcharset($content)
54 * 657: function convertHTMLToUtf8($content,$charset='')
55 * 685: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
56 * 712: function typoSearchTags(&$body)
57 * 741: function extractLinks($content)
58 * 812: function extractHyperLinks($string)
59 *
60 * SECTION: Indexing; external URL
61 * 871: function indexExternalUrl($externalUrl)
62 * 902: function getUrlHeaders($url)
63 *
64 * SECTION: Indexing; external files (PDF, DOC, etc)
65 * 948: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
66 * 1054: function readFileContent($ext,$absFile,$cPKey)
67 * 1071: function fileContentParts($ext,$absFile)
68 * 1089: function splitRegularContent($content)
69 *
70 * SECTION: Analysing content, Extracting words
71 * 1122: function charsetEntity2utf8(&$contentArr, $charset)
72 * 1145: function processWordsInArrays($contentArr)
73 * 1170: function procesWordsInArrays($contentArr)
74 * 1180: function bodyDescription($contentArr)
75 * 1202: function indexAnalyze($content)
76 * 1223: function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
77 * 1242: function analyzeBody(&$retArr,$content)
78 * 1262: function metaphone($word,$retRaw=FALSE)
79 *
80 * SECTION: SQL; TYPO3 Pages
81 * 1304: function submitPage()
82 * 1378: function submit_grlist($hash,$phash_x)
83 * 1398: function submit_section($hash,$hash_t3)
84 * 1416: function removeOldIndexedPages($phash)
85 *
86 * SECTION: SQL; External media
87 * 1459: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
88 * 1525: function submitFile_grlist($hash)
89 * 1539: function submitFile_section($hash)
90 * 1553: function removeOldIndexedFiles($phash)
91 *
92 * SECTION: SQL Helper functions
93 * 1589: function checkMtimeTstamp($mtime,$phash)
94 * 1625: function checkContentHash()
95 * 1642: function checkExternalDocContentHash($hashGr,$content_md5h)
96 * 1656: function is_grlist_set($phash_x)
97 * 1669: function update_grlist($phash,$phash_x)
98 * 1684: function updateTstamp($phash,$mtime=0)
99 * 1699: function updateSetId($phash)
100 * 1714: function updateParsetime($phash,$parsetime)
101 * 1727: function updateRootline()
102 * 1742: function getRootLineFields(&$fieldArr)
103 * 1761: function removeLoginpagesWithContentHash()
104 * 1778: function includeCrawlerClass()
105 *
106 * SECTION: SQL; Submitting words
107 * 1805: function checkWordList($wl)
108 * 1842: function submitWords($wl,$phash)
109 * 1866: function freqMap($freq)
110 *
111 * SECTION: Hashing
112 * 1899: function setT3Hashes()
113 * 1925: function setExtHashes($file,$subinfo=array())
114 * 1949: function md5inthash($str)
115 * 1959: function makeCHash($paramArray)
116 *
117 * SECTION: Internal logging functions
118 * 1991: function log_push($msg,$key)
119 * 2000: function log_pull()
120 * 2011: function log_setTSlogMessage($msg, $errorNum=0)
121 *
122 * SECTION: tslib_fe hooks:
123 * 2036: function fe_headerNoCache(&$params, $ref)
124 *
125 * TOTAL FUNCTIONS: 59
126 * (This index is automatically created/updated by the extension "extdeveval")
127 *
128 */
129
130
131 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
132
133
134 /**
135 * Indexing class for TYPO3 frontend
136 *
137 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
138 * @package TYPO3
139 * @subpackage tx_indexedsearch
140 */
141 class tx_indexedsearch_indexer {
142
// Messages (keys are the codes used to look up a reason text after checkMtimeTstamp() has been called):
var $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => "The configured max-age was exceeded for the document and thus it's indexed.",
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

// HTML code blocks to exclude from indexing:
var $excludeSections = 'script,style';

// Supported Extensions for external files:
var $external_parsers = array();		// External parser objects, keys are file extension names. Values are objects with certain methods.

// Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
var $defaultGrList = '0,-1';

// Min/Max times:
var $tstamp_maxAge = 0;		// If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
var $tstamp_minAge = 0;		// If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
var $maxExternalFiles = 0;	// Max number of external files to index.
var $flagBitMask;			// Assigned in init() from the 'flagBitMask' extension setting (limited to 0-255 there). Declared here so it is no longer created as a dynamic property.

var $forceIndexing = FALSE;		// If true, indexing is forced despite of hashes etc.
var $crawlerActive = FALSE;		// Set when crawler is detected (internal)

// INTERNALS:
var $defaultContentArray=array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => '',
);
var $wordcount = 0;
var $externalFileCounter = 0;

var $conf = array();				// Configuration set internally (see init functions for required keys and their meaning)
var $indexerConfig = array();		// Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
var $hash = array();				// Hash array, contains phash and phash_grouping
var $file_phash_arr = array();		// Hash array for files
var $contentParts = array();		// Content of TYPO3 page
var $content_md5h = '';
var $internal_log = array();		// Internal log
var $indexExternalUrl_content = '';

var $cHashParams = array();			// cHashparams array

var $freqRange = 32000;
var $freqMax = 0.1;

// Objects:
var $csObj;				// Charset class object , t3lib_cs
var $metaphoneObj;		// Metaphone object, if any
var $lexerObj;			// Lexer object for word splitting
var $htmlParser;		// t3lib_parseHtml object, created on first use in extractHyperLinks(). Declared here so it is no longer created as a dynamic property.
198
199
200
201 /**
202 * Parent Object (TSFE) Initialization
203 *
204 * @param object Parent Object (frontend TSFE object), passed by reference
205 * @return void
206 */
function hook_indexContent(&$pObj) {

	// Settings made for this extension in the Extension Manager (stored serialized):
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// Crawler activation requires all three: the crawler extension is loaded, a crawler
	// session is running and re-indexing was requested as a processing instruction.
	if (t3lib_extMgm::isLoaded('crawler')
			&& $pObj->applicationData['tx_crawler']['running']
			&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {

		// Leave a trace in the crawler log and force indexing regardless of timestamps:
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

	// Nothing more to do when indexing is not enabled for the page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

	// Each branch below rules out one reason not to index; only the last branch indexes.
	if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {

		// Collect the uids of the root line, keeping the root line keys:
		$rootlineUids = array();
		foreach($pObj->config['rootLine'] as $rlkey => $rldat) {
			$rootlineUids[$rlkey] = $rldat['uid'];
		}

		// Internal configuration, taken over from the frontend object:
		$this->conf = array(
				// Information about the page for which the indexing takes place
			'id' => $pObj->id,										// Page id
			'type' => $pObj->type,									// Page type
			'sys_language_uid' => $pObj->sys_language_uid,			// sys_language UID of the language of the indexing
			'MP' => $pObj->MP,										// MP variable, if any (Mount Points)
			'gr_list' => $pObj->gr_list,							// Group list

			'cHash' => $pObj->cHash,								// cHash string for additional parameters
			'cHash_array' => $pObj->cHash_array,					// Array of the additional parameters

			'crdate' => $pObj->page['crdate'],						// The creation date of the TYPO3 page
			'page_cache_reg1' => $pObj->page_cache_reg1,			// reg1 of the caching table
			'rootline_uids' => $rootlineUids,						// Root line uids

				// Content of page
			'content' => $pObj->content,							// Content string (HTML of TYPO3 page)
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),	// Alternative title for indexing
			'metaCharset' => $pObj->metaCharset,					// Character set of content (converted to utf-8 during indexing)
			'mtime' => $pObj->register['SYS_LASTCHANGED'],			// Most recent modification time (seconds) of the content on the page

				// Configuration of behavior
			'index_externals' => $pObj->config['config']['index_externals'],	// Whether to index external documents like PDF, DOC etc.
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],		// Length of description text

				// Not used for regular frontend indexing
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0
		);

		// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
282
283
284
285
286
287
288
289
290 /****************************
291 *
292 * Backend API
293 *
294 ****************************/
295
296 /**
297 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
298 *
299 * @param integer The page uid, &id=
300 * @param integer The page type, &type=
301 * @param integer sys_language uid, typically &L=
302 * @param string The MP variable (Mount Points), &MP=
303 * @param array Rootline array of only UIDs.
304 * @param array Array of GET variables to register with this indexing
305 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
306 * @return void
307 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

	// Start from a fresh internal configuration holding the page identification:
	$this->conf = array(
		'id' => $id,								// Page id (integer)
		'type' => $type,							// Page type (integer)
		'sys_language_uid' => $sys_language_uid,	// sys_language UID of the language of the indexing (integer)
		'MP' => $MP,								// MP variable, if any (Mount Points) (string)
		'gr_list' => '0,-1'							// Group list (hardcoded for now...)
	);

	// cHash string for the additional parameters; only calculated when asked for:
	if ($createCHash) {
		$this->conf['cHash'] = $this->makeCHash($cHash_array);
	} else {
		$this->conf['cHash'] = '';
	}
	$this->conf['cHash_array'] = $cHash_array;		// Array of the additional parameters

	// Defaults:
	$this->conf['freeIndexUid'] = 0;
	$this->conf['freeIndexSetId'] = 0;
	$this->conf['page_cache_reg1'] = '';

	// Root line uids, as passed in:
	$this->conf['rootline_uids'] = $uidRL;

	// Fixed behavior for backend indexing:
	$this->conf['index_externals'] = 1;		// Index external documents like PDF, DOC etc. (if possible)
	$this->conf['index_descrLgd'] = 200;	// Length of description text (max 250, default 200)

	// Initialize the indexer with this configuration:
	$this->init();
}
339
340 /**
341 * Sets the free-index uid. Can be called right after backend_initIndexer()
342 *
343 * @param integer Free index UID
344 * @param integer Set id - an integer identifying the "set" of indexing operations.
345 * @return void
346 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	// Both values are stored unchanged in the internal configuration,
	// replacing the zero defaults that backend_initIndexer() puts there.
	$this->conf['freeIndexUid'] = $freeIndexUid;
	$this->conf['freeIndexSetId'] = $freeIndexSetId;
}
351
352 /**
353 * Indexing records as the content of a TYPO3 page.
354 *
355 * @param string Title equivalent
356 * @param string Keywords equivalent
357 * @param string Description equivalent
358 * @param string The main content to index
359 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
360 * @param integer Last modification time, in seconds
361 * @param integer The creation date of the content, in seconds
362 * @param integer The record UID that the content comes from (for registration with the indexed rows)
363 * @return void
364 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

	// Register times and the originating record in the internal configuration:
	$this->conf['mtime'] = $mtime;			// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;		// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

	// Escape every part so that it can be embedded in the fake HTML document below.
	// NOTE(review): htmlspecialchars() is called without an explicit charset; on newer PHP versions
	// it may return an empty string for input that is not valid in its default charset - confirm for non-UTF-8 $charset.
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedContent = htmlspecialchars($content);

	// Fake HTML document, parsed like a rendered TYPO3 page by indexTypo3PageContent():
	$this->conf['content'] = '
<html>
<head>
<title>'.$escapedTitle.'</title>
<meta name="keywords" content="'.$escapedKeywords.'" />
<meta name="description" content="'.$escapedDescription.'" />
</head>
<body>
'.$escapedContent.'
</body>
</html>';

	// Charset of all the parts above (converted to utf-8 during indexing); no alternative title:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

	// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}
392
393
394
395
396
397
398
399
400
401
402
403
404
405 /********************************
406 *
407 * Initialization
408 *
409 *******************************/
410
411 /**
412 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
413 *
414 * @return void
415 */
function init() {
	global $TYPO3_CONF_VARS;

	// Parameters behind the cHash, as registered in the configuration:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		// Add the cHash itself so that URL's come out right...
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		// encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT
		// be a part of this array!!! If it is it will be exposed in links!!!
		unset($this->cHashParams['encryptionKey']);
	}

	// Set phash / phash_grouping which identify the indexed page:
	$this->setT3Hashes();

	// Indexer configuration from the Extension Manager interface; ages are configured in hours:
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$extConf = $this->indexerConfig;
	$this->tstamp_minAge = t3lib_div::intInRange($extConf['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($extConf['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($extConf['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($extConf['flagBitMask'],0,255);

	// External document parsers are only needed when external documents are indexed.
	// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

	// Lexer (class that deconstructs the text into words); the bundled lexer is used unless another one is configured.
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer']) {
		$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'];
	}
	$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

	// Metaphone hook, only if configured; the hook object gets a reference back to this indexer.
	// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = &$this;
	}

	// Charset conversion object:
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
460
461 /**
462 * Initialize external parsers
463 *
464 * @return void
465 * @access private
466 * @see init()
467 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	// Nothing to do unless parsers have been registered as an array (file extension => object reference):
	if (!isset($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])
			|| !is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}

	foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

		// A reference that did not resolve to an object cannot be initialized; drop the entry
		// instead of writing a property to / calling a method on a non-object.
		if (!is_object($this->external_parsers[$extension])) {
			unset($this->external_parsers[$extension]);
			continue;
		}

		// Give the parser a reference back to this indexer:
		$this->external_parsers[$extension]->pObj = &$this;

		// Init parser and if it returns false, unset its entry again:
		if (!$this->external_parsers[$extension]->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498 /********************************
499 *
500 * Indexing; TYPO3 pages (HTML content)
501 *
502 *******************************/
503
504 /**
505 * Start indexing of the TYPO3 page
506 *
507 * @return void
508 */
function indexTypo3PageContent() {

	// $check is a code from checkMtimeTstamp(); its meaning is listed in $this->reasons and a positive value means "index".
	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	if ($check > 0 || !$is_grlist || $this->forceIndexing) {

			// Setting message:
		if ($this->forceIndexing) {
			$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
		} elseif ($check > 0) {
			$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
		} else {
			$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
		}

			// Divide into title,keywords,description and body:
		$this->log_push('Split content','');
		$this->contentParts = $this->splitHTMLContent($this->conf['content']);
		if ($this->conf['indexedDocTitle']) {
				// An alternative title, if configured, replaces the one found in the HTML:
			$this->contentParts['title'] = $this->conf['indexedDocTitle'];
		}
		$this->log_pull();

			// Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
			// The separator is passed first: the legacy (array, separator) argument order is no longer accepted by PHP 8.
		$this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

			// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
			// If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
			// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
		$checkCHash = $this->checkContentHash();
		if (!is_array($checkCHash) || $check===1) {
			$Pstart=t3lib_div::milliseconds();

			$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
			$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
			$this->log_pull();

				// Splitting words
			$this->log_push('Extract words from content','');
			$splitInWords = $this->processWordsInArrays($this->contentParts);
			$this->log_pull();

				// Analyse the indexed words.
			$this->log_push('Analyse the extracted words','');
			$indexArr = $this->indexAnalyze($splitInWords);
			$this->log_pull();

				// Submitting page (phash) record
			$this->log_push('Submitting page','');
			$this->submitPage();
			$this->log_pull();

				// Check words and submit to word list if not there
			$this->log_push('Check word list and submit words','');
			$this->checkWordList($indexArr);
			$this->submitWords($indexArr,$this->hash['phash']);
			$this->log_pull();

				// Set parsetime (measured from just before the charset conversion)
			$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

				// Checking external files if configured for.
			$this->log_push('Checking external files','');
			if ($this->conf['index_externals']) {
				$this->extractLinks($this->conf['content']);
			}
			$this->log_pull();
		} else {
				// Same content hash found on an indexed page: only the bookkeeping is refreshed.
			$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
			$this->updateSetId($this->hash['phash']);
			$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
			$this->updateRootline();
			$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		}
	} else {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
	}
}
588
589 /**
590 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
591 *
592 * @param string HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
593 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
594 * @see splitRegularContent()
595 */
function splitHTMLContent($content) {

	$contentArr = $this->defaultContentArray;

	// Divide head from body at the first "<body" (case-insensitive).
	$bodyPart = stristr($content,'<body');
	if ($bodyPart === FALSE) {
		// No body tag: substr($content,0,-0) would return an empty string and the title / meta tags
		// would be lost, so the whole content is searched as "head" and the body is left empty.
		$contentArr['body'] = '';
		$headPart = $content;
	} else {
		$contentArr['body'] = $bodyPart;
		$headPart = (string)substr($content,0,-strlen($bodyPart));
	}

	// Get title; when it contains a ":" only the part after the first one is used.
	$this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
	$titleParts = explode(':',$contentArr['title'],2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

	// Collect the attribute strings of all meta tags; each call continues in what follows the tag found before.
	$meta = array();
	for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }

	// Pick keywords and description from the meta tags found:
	for($i=0;isset($meta[$i]);$i++) {
		$metaAttribs = t3lib_div::get_tag_attributes($meta[$i]);
		$metaName = isset($metaAttribs['name']) ? $metaAttribs['name'] : '';
		$metaContent = isset($metaAttribs['content']) ? $metaAttribs['content'] : '';
		if(stristr($metaName,'keywords')) $contentArr['keywords'].=','.$metaContent;
		if(stristr($metaName,'description')) $contentArr['description'].=','.$metaContent;
	}

	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

	// Get rid of unwanted sections (ie. scripting and style stuff) in body, one occurrence per call:
	$tagList = explode(',',$this->excludeSections);
	foreach($tagList as $tag) {
		while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
	}

	// Remove tags, but first make sure we don't concatenate words by doing it:
	$contentArr['body'] = str_replace('<',' <',$contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

	// Return array
	return $contentArr;
}
635
636 /**
637 * Extract the charset value from HTML meta tag.
638 *
639 * @param string HTML content
640 * @return string The charset value if found.
641 */
function getHTMLcharset($content) {
	// Uses preg_match() because the POSIX function eregi() no longer exists in PHP 7 and later.
	// First find the <meta http-equiv="content-type" ...> tag (case-insensitive, either quote style) ...
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
		// ... then read the value of "charset=" inside that tag:
		if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) {
			return $reg2[1];
		}
	}

	// No such meta tag or no charset in it (same NULL result as before, now explicit):
	return NULL;
}
649
650 /**
651 * Converts a HTML document to utf-8
652 *
653 * @param string HTML content, any charset
654 * @param string Optional charset (otherwise extracted from HTML)
655 * @return string Converted HTML
656 */
function convertHTMLToUtf8($content,$charset='') {

	// Without a given charset, look for one in the HTML itself:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

	// Recode only when a charset is known and it is not utf-8 already:
	$needsConversion = ($charset && $charset!=='utf-8');
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

	// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
672
673 /**
674 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
675 * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
676 * <title> of document or removing <script>-sections
677 *
678 * @param string String to search in
679 * @param string Tag name, eg. "script"
680 * @param string Passed by reference: Content inside found tag
681 * @param string Passed by reference: Content after found tag
682 * @param string Passed by reference: Attributes of the found tag.
683 * @return boolean Returns false if tag was not found, otherwise true.
684 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$endTag = '</'.$tagName.'>';
	$startTag = '<'.$tagName;

	// stristr used because we want a case-insensitive search for the tag.
	$fromStartTag = stristr($string,$startTag);
	if ($fromStartTag === FALSE) return false;	// if the tag was not found, return false (output variables are left untouched)

	// Split what follows the tag name at the first ">": attributes before it, the rest after it.
	// A start tag that is never closed leaves no rest; use an empty string instead of an undefined offset.
	$tagParts = explode('>',(string)substr($fromStartTag,strlen($startTag)),2);
	$paramList = $tagParts[0];
	$afterStartTag = isset($tagParts[1]) ? $tagParts[1] : '';

	$fromEndTag = stristr($afterStartTag,$endTag);
	if ($fromEndTag !== FALSE) {
		// Everything before the start tag; its length follows from the match found above,
		// so no lowercased copies of the whole string are needed.
		$stringBefore = (string)substr($string,0,strlen($string)-strlen($fromStartTag));
		$tagContent = (string)substr($afterStartTag,0,strlen($afterStartTag)-strlen($fromEndTag));
		$stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
	} else {	// If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
		$tagContent = '';
		$stringAfter = $afterStartTag;
	}

	return true;
}
705
706 /**
707 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
708 *
709 * @param string HTML Content, passed by reference
710 * @return boolean Returns true if a TYPO3SEARCH_ tag was found, otherwise false.
711 */
function typoSearchTags(&$body) {
	// Split at every "<!--TYPO3SEARCH_" (one optional whitespace after "<!--"); each chunk after the
	// first one then starts with the marker name, e.g. "begin-->..." or "end-->...".
	$expBody = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

	// No marker found (or the split failed): leave $body as it is.
	if (!is_array($expBody) || count($expBody)<=1) {
		return false;
	}

	$body = '';
	$prev = '';		// Last chunk that was not a marker chunk; initialized so an early "end" marker never reads an undefined variable.

	foreach($expBody as $val) {
		$part = explode('-->',$val,2);
		$marker = trim($part[0]);

		if ($marker==='begin') {
			// Content after a "begin" marker is kept; a marker without closing "-->" has no content.
			$body.= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker==='end') {
			// An "end" marker keeps the remembered chunk, which is empty right after a "begin".
			$body.= $prev;
		} else {
			$prev = $val;
		}
	}

	return true;
}
734
735 /**
736 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
737 *
738 * @param string HTML content
739 * @return void
740 */
function extractLinks($content) {

	// Get links:
	$list = $this->extractHyperLinks($content);

	// Crawler object; stays NULL (instead of undefined) when files are indexed directly.
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

	// Traverse links:
	foreach($list as $linkInfo) {

		// localPath means: This file is sent by a download script. While the indexed URL has to point
		// to $linkInfo['href'], the absolute path to the file is specified in 'localPath'!
		$hasLocalPath = !empty($linkInfo['localPath']);

		// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

		// Parse URL; parse_url() may return FALSE for malformed URLs, which is treated like "no parts":
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}

		// Check for jumpurl (TYPO3 specific thing...): the real target is the value of that parameter.
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);
			$linkSource = (isset($getP['jumpurl']) && is_string($getP['jumpurl'])) ? $getP['jumpurl'] : '';

			// Parse again due to new linkSource!
			$qParts = parse_url($linkSource);
			if (!is_array($qParts)) {
				$qParts = array();
			}
		}

		// Links with a scheme are external URLs (http or otherwise), indexed only if enabled:
		if (!empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

		// Links carrying a query string are not treated as files:
		if (!empty($qParts['query'])) {
			continue;
		}

		// Resolve the link to a local file:
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		if (is_object($crawler)) {
			// Hand the file over to the crawler queue; 'alturl' is only set for files served by a download script.
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);		// The page content is not passed along to the queue

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($hasLocalPath) {
			// Index under the href, reading the content from the local path; the extension comes from the local path.
			$fI = pathinfo($linkSource);
			$ext = strtolower(isset($fI['extension']) ? $fI['extension'] : '');
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			// Index local file:
			$this->indexRegularDocument($linkSource);
		}
	}
}
819
/**
 * Extracts all links to external documents from content string.
 * Each returned entry has the keys "tag" (the <a> tag), "href" and "localPath".
 * "localPath" is looked up in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] by the short MD5 of the href; it is an empty string if nothing is registered for the href and false if that registry is not an array.
 *
 * @param	string		Content to analyse
 * @return	array		Array of hyperlinks
 * @see extractLinks()
 */
function extractHyperLinks($string) {
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$links = array();
	foreach ($this->htmlParser->splitTags('a',$string) as $index => $tag) {

		// Only the odd-indexed parts of the split result are inspected:
		if (!($index%2)) {
			continue;
		}

		$attributes = $this->htmlParser->get_tag_attributes($tag,1);
		$tagName = $this->htmlParser->getFirstTagName($tag);	// The 'name' of the first tag
		if (strtolower($tagName) != 'a') {
			continue;
		}

		$href = $attributes[0]['href'];
		if (!$href) {
			continue;
		}

		// Check if a local path to that file has been set - useful if you are using a download script.
		$hrefHash = t3lib_div::shortMD5($href);
		$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
		if (is_array($registeredFiles)) {
			$localPath = isset($registeredFiles[$hrefHash]) ? $registeredFiles[$hrefHash] : '';
		} else {
			$localPath = false;
		}

		$links[] = array(
			'tag' => $tag,
			'href' => $attributes[0]['href'],
			'localPath' => $localPath
		);
	}

	return $links;
}
862
863
864
865
866
867
868
869
870
871
872
873 /******************************************
874 *
875 * Indexing; external URL
876 *
877 ******************************************/
878
/**
 * Index External URLs HTML content
 * The URL is only fetched and indexed if its headers announce a "text/html" content type.
 * The fetched content is also kept in $this->indexExternalUrl_content.
 *
 * @param	string		URL, eg. "http://typo3.org/"
 * @return	void
 * @see indexRegularDocument()
 */
function indexExternalUrl($externalUrl) {

	// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
		return;
	}

	// HTTP header names are case-insensitive, so "Content-Type" is looked up regardless of case:
	$contentType = '';
	foreach ($urlHeaders as $headerName => $headerValue) {
		if (strtolower(trim($headerName)) === 'content-type') {
			$contentType = (string)$headerValue;
			break;
		}
	}
	if (!stristr($contentType,'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

	// Create temporary file:
	$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
	$tmpFile = $tmpBase.'.html';
	t3lib_div::writeFile($tmpFile, $content);

	// Index that file:
	$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');	// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)

	// Clean up:
	if (@is_file($tmpFile)) {
		unlink($tmpFile);
	}
	// NOTE(review): t3lib_div::tempnam() presumably creates the base file itself (like PHP's tempnam()) - confirm. If it does, that file must be removed as well or it is left behind on every call.
	if ($tmpBase && @is_file($tmpBase)) {
		unlink($tmpBase);
	}
}
909
/**
 * Getting HTTP headers of URL
 * Every non-empty line of the answer is split at the first colon into header name (key) and header value; both are trimmed.
 * Lines without a colon (like the HTTP status line) are kept as keys with an empty string as value.
 *
 * @param	string		The URL
 * @return	mixed		If no answer, returns false. Otherwise an array where HTTP headers are keys
 */
function getUrlHeaders($url) {
	$content = t3lib_div::getURL($url,2);	// Try to get the headers only

	if (!strlen($content)) {
		return false;
	}

	// Compile headers:
	$retVal = array();
	$headers = t3lib_div::trimExplode(chr(10),$content,1);
	foreach ($headers as $line) {
		if (!strlen(trim($line))) {
			break;	// Stop at the first empty line (= end of header)
		}

		$pair = explode(':', $line, 2);
		$retVal[trim($pair[0])] = isset($pair[1]) ? trim($pair[1]) : '';
	}

	return $retVal;
}
935
936
937
938
939
940
941
942
943
944
945
946
947
948 /******************************************
949 *
950 * Indexing; external files (PDF, DOC, etc)
951 *
952 ******************************************/
953
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 * The document is processed once for each section pointer returned by fileContentParts().
 * A section record is submitted for each of them, also when the content itself is not (re-)indexed.
 *
 * @param	string		Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param	boolean		If set, indexing is forced (despite content hashes, mtime etc).
 * @param	string		Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param	string		File extension for temporary file.
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

	// Init: an explicitly given extension wins over the one found in $file.
	$fI = pathinfo($file);
	$ext = $altExtension ? $altExtension : strtolower(isset($fI['extension']) ? $fI['extension'] : '');

	// Create abs-path:
	if ($contentTmpFile) {
		$absFile = $contentTmpFile;
	} else {
		// Relative paths get PATH_site prepended, absolute paths are passed through:
		$absFile = t3lib_div::isAbsPath($file) ? $file : t3lib_div::getFileAbsFileName(PATH_site.$file);
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	}

	// Bail out if there is nothing to index or no parser for this type:
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

	// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

			// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

				// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
					// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
					// Separator first: the reversed argument order of implode() is not accepted by PHP 8 anymore.
					$content_md5h = $this->md5inthash(implode('',$contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

						// Increment counter:
						$this->externalFileCounter++;

						// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

						// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

						// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

						// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

						// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

		// Checking and setting sections:
		# $this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
		$this->submitFile_section($phash_arr['phash']);	// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->log_pull();
	}
}
1059
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return	array		Standard content array (title, description, keywords, body keys) as returned by the external parser. NULL if there is no parser object for the extension.
 */
function readFileContent($ext,$absFile,$cPKey) {

	// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		return $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	// No parser object available: explicit NULL instead of returning an undefined variable.
	return NULL;
}
1078
/**
 * Creates an array with pointers to divisions of document.
 * The division is decided by the external parser registered for the extension. Without a parser object the document is one single part, array(0).
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Array of pointers to sections that the document should be divided into
 */
function fileContentParts($ext,$absFile) {

	// No parser object for this extension: no division.
	if (!is_object($this->external_parsers[$ext])) {
		return array(0);
	}

	// Consult relevant external document parser:
	return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
}
1096
/**
 * Splits non-HTML content (from external files for instance)
 * The result is a copy of $this->defaultContentArray where "body" is set to the input content.
 *
 * @param	string		Input content (non-HTML) to index.
 * @return	array		Array of content, having the key "body" set (the other keys are taken unchanged from the default content array)
 * @see splitHTMLContent()
 */
function splitRegularContent($content) {
	$parts = $this->defaultContentArray;

	// All of the content goes into the body:
	$parts['body'] = $content;

	return $parts;
}
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124 /**********************************
1125 *
1126 * Analysing content, Extracting words
1127 *
1128 **********************************/
1129
/**
 * Convert character set and HTML entities in the value of input content array keys
 * Empty values are left untouched.
 *
 * @param	array		Standard content array, passed by reference
 * @param	string		Charset of the input content (converted to utf-8)
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset) {

	// Iterating over the keys (instead of the removed each() construct) so the values can be replaced in place:
	foreach (array_keys($contentArr) as $key) {
		if (!strlen($contentArr[$key])) {
			continue;
		}

		// Convert charset if necessary
		if ($charset!=='utf-8') {
			$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
		}

		// decode all numeric / html-entities in the string to real characters:
		$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
	}
}
1153
/**
 * Processing words in the array from split*Content -functions
 *
 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return	array		Content input array modified so each key is now an array of words (without duplicates for title, keywords and description)
 */
function processWordsInArrays($contentArr) {

	// Split all parts to words (foreach over the keys replaces the each() construct, which was removed in PHP 8):
	foreach (array_keys($contentArr) as $key) {
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

	// For title, keywords, and description we don't want duplicates:
	foreach (array('title','keywords','description') as $field) {
		$contentArr[$field] = array_unique($contentArr[$field]);
	}

	// Return modified array:
	return $contentArr;
}
1176
/**
 * Processing words in the array from split*Content -functions
 * This misspelled function name is only kept as a wrapper; it forwards to processWordsInArrays().
 *
 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return	array		The result of processWordsInArrays() for the input array
 * @deprecated
 * @see processWordsInArrays()
 */
function procesWordsInArrays($contentArr) {
	$processed = $this->processWordsInArrays($contentArr);

	return $processed;
}
1188
/**
 * Extracts the sample description text from the content array.
 * The maximum length is taken from $this->conf['index_descrLgd'] (forced into the range 0-255 by t3lib_div::intInRange(), which is also given 200 as its fourth argument).
 *
 * @param	array		Content array
 * @return	string		Description string (empty string if the maximum length is zero)
 */
function bodyDescription($contentArr) {

	// Always defined, so a zero length limit yields an empty description instead of an undefined variable:
	$bodyDescription = '';

	// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
		// Turn tabs and line breaks into plain spaces:
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1210
/**
 * Analyzes content to use for indexing,
 * Title, keywords and description are registered through analyzeHeaderinfo() with the offsets 7, 6 and 5; the body is registered through analyzeBody() afterwards.
 *
 * @param	array		Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return	array		Index array, as filled by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content) {
	$indexArr = array();

	// Header fields and the offset passed on for each of them:
	$headerOffsets = array(
		'title' => 7,
		'keywords' => 6,
		'description' => 5
	);
	foreach ($headerOffsets as $field => $offset) {
		$this->analyzeHeaderinfo($indexArr,$content,$field,$offset);
	}

	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}
1228
/**
 * Calculates relevant information for headercontent
 * For each word the bit 2^$offset is set in "cmp", "count" is incremented and "hash" / "metaphone" are (re)calculated. $this->wordcount is incremented per word.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @param	string		Key from standard content array
 * @param	integer		Bit-wise priority to type
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {

	// Nothing to do if there are no words for this key:
	if (!isset($content[$key]) || !is_array($content[$key])) {
		return;
	}

	$typeBit = pow(2,$offset);

	// foreach replaces the each() construct, which was removed in PHP 8:
	foreach ($content[$key] as $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		// "cmp" and "count" start at zero for words not seen before:
		$retArr[$val]['cmp'] = (isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0)|$typeBit;
		$retArr[$val]['count'] = (isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0)+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
1249
/**
 * Calculates relevant information for bodycontent
 * Words not yet in the index array get "first" (the key of the word in the body array), "hash" and "metaphone" set. "count" and $this->wordcount are incremented for every word.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @return	void
 */
function analyzeBody(&$retArr,$content) {
	foreach ($content['body'] as $key => $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		if (!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

		// Counting starts at zero for a word which has no count yet:
		$retArr[$val]['count'] = (isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0)+1;
		$this->wordcount++;
	}
}
1269
/**
 * Creating metaphone based hash from input word
 * If $this->metaphoneObj is an object, its metaphone() method is used (with the current sys_language_uid), otherwise PHP's metaphone() function.
 *
 * @param	string		Word to convert
 * @param	boolean		If set, returns the raw metaphone value (not hashed)
 * @return	mixed		Metaphone hash integer (or raw value, string)
 */
function metaphone($word,$retRaw=FALSE) {

	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

	// Return raw value?
	if ($retRaw) {
		return $rawValue;
	}

	// Otherwise create hash and return integer (zero for an empty metaphone value)
	return $rawValue=='' ? 0 : hexdec(substr(md5($rawValue),0,7));
}
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308 /********************************
1309 *
1310 * SQL; TYPO3 Pages
1311 *
1312 *******************************/
1313
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 * Old records for the phash are removed first. Then records are inserted into index_phash, index_section, index_grlist and index_fulltext - and into index_debug if debugMode is enabled.
 *
 * @return	void
 */
function submitPage() {

	// Remove any current data for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);

	// Setting new phash_row
	$phashRow = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

	// PROCESSING index_section
	$this->submit_section($this->hash['phash'],$this->hash['phash']);

	// PROCESSING index_grlist
	$this->submit_grlist($this->hash['phash'],$this->hash['phash']);

	// PROCESSING index_fulltext (optionally cut down to the configured length)
	$fullText = implode(' ', $this->contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$fulltextRow = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => $fullText
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

	// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

	// Content and body are cut down to 1000 bytes for the debug record:
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
		'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $this->hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
}
1384
/**
 * Stores gr_list in the database.
 * Inserts one index_grlist record with the current gr_list ($this->conf['gr_list']) and its integer hash.
 *
 * @param	integer		Search result record phash
 * @param	integer		Actual phash of current content
 * @return	void
 * @see update_grlist()
 */
function submit_grlist($hash,$phash_x) {
	$grList = $this->conf['gr_list'];

	// Setting the gr_list record
	$grlistRow = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($grList),
		'gr_list' => $grList
	);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grlistRow);
}
1404
/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 * The record is passed through getRootLineFields() before it is inserted into index_section.
 *
 * @param	integer		phash of TYPO3 parent search result record
 * @param	integer		phash of the file indexation search record
 * @return	void
 */
function submit_section($hash,$hash_t3) {
	$sectionRow = array();
	$sectionRow['phash'] = $hash;
	$sectionRow['phash_t3'] = $hash_t3;
	$sectionRow['page_id'] = intval($this->conf['id']);

	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1424
/**
 * Removes records for the indexed page, $phash
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedPages($phash) {
	$phashInt = intval($phash);

	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.$phashInt);
	}

	// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phashInt);
}
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453 /********************************
1454 *
1455 * SQL; External media
1456 *
1457 *******************************/
1458
1459
/**
 * Updates db with information about the file
 * Old records for the phash are removed first. Then records are inserted into index_phash and index_fulltext - and into index_debug if debugMode is enabled.
 *
 * @param	array		Array with phash and phash_grouping keys for file
 * @param	string		File name
 * @param	array		Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param	string		File extension determining the type of media.
 * @param	integer		Modification time of file.
 * @param	integer		Creation time of file.
 * @param	integer		Size of file in bytes
 * @param	integer		Content HASH value.
 * @param	array		Standard content array (using only title and body for a file)
 * @return	void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

	// Find item Type: the parser may map the extension to an item type, otherwise the extension itself is stored.
	$storeItemType = '';
	if (isset($this->external_parsers[$ext]->ext2itemtype_map[$ext])) {
		$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	}
	$storeItemType = $storeItemType ? $storeItemType : $ext;

	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

	// Split filename: a file name with a scheme is an external URL. Local files have no "scheme" key at all, so it must not be read unchecked.
	$fileParts = parse_url($file);
	$isExternalUrl = (is_array($fileParts) && !empty($fileParts['scheme'])) ? 1 : 0;

	// Setting new
	$fields = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => time(),
		'crdate' => time(),
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $isExternalUrl,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

	// PROCESSING index_fulltext
	$fields = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts)
	);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

	// PROCESSING index_debug
	if ($this->indexerConfig['debugMode']) {
		$fields = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize(array(
				'cHashParams' => $subinfo,
				'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
				'logs' => $this->internal_log,
				'lexer' => $this->lexerObj->debugString,
			))
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
	}
}
1533
/**
 * Stores file gr_list for a file IF it does not exist already
 * A record counts as existing if index_grlist has a row for the phash with the hash of either the default gr_list or the current gr_list.
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_grlist($hash) {

	// Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
	$where = 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')';
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		return;
	}

	$this->submit_grlist($hash,$hash);
}
1547
/**
 * Stores file section for a file IF it does not exist
 * The section record relates the file phash to the phash of the current page ($this->hash['phash']).
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_section($hash) {

	// Testing if there is a section
	$where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		return;
	}

	$this->submit_section($hash,$this->hash['phash']);
}
1561
/**
 * Removes records for the indexed file, $phash
 * Deletes from index_phash, index_grlist, index_fulltext and index_debug; index_section is not touched here.
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedFiles($phash) {
	$where = 'phash='.intval($phash);

	// Removing old registrations for tables.
	$tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
	}
}
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590 /********************************
1591 *
1592 * SQL Helper functions
1593 *
1594 *******************************/
1595
/**
 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
 * Return positive integer if the page needs to being indexed!
 *
 * Result values (see $this->reasons):
 * -2: Min age was NOT exceeded and so indexing cannot occur.
 * -1: Mtimes matched so no need to reindex page (the tstamp is updated unless a max age is set).
 *  1: Max age exceeded, page must be indexed again.
 *  2: mtime of indexed page doesn't match mtime given for current content and we must index page.
 *  3: No mtime was set, so we will index...
 *  4: No indexed page found, so of course we will index.
 *
 * @param	integer		mtime value to test against limits and indexed page.
 * @param	integer		"phash" used to select any already indexed page to see what its mtime is.
 * @return	integer		Result integer: Generally: <0 = No indexing, >0 = Do indexing
 */
function checkMtimeTstamp($mtime,$phash) {

	// Select indexed page:
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	// Page has never been indexed (is not represented in the index_phash table).
	if (!$row) {
		return 4;
	}

	// The configured max-age was exceeded for the document and thus it's indexed.
	if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time()) {
		return 1;
	}

	// A min-age is set and it was not exceeded.
	if ($this->tstamp_minAge && ($row['tstamp']+$this->tstamp_minAge) >= time()) {
		return -2;
	}

	// The minimum age was exceeded (or not set), but mtime was not set, so the page is indexed.
	if (!$mtime) {
		return 3;
	}

	// mtime was set and is different from the index_phash mtime, so it's about time to re-index.
	if ($row['item_mtime'] != $mtime) {
		return 2;
	}

	// mtime matched the document, so no changes detected and no content updated
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('Mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
	} else {
		$this->updateTstamp($phash);	// Update the timestamp
		$this->log_setTSlogMessage('Mtime matched, timestamp updated.',1);
	}

	return -1;
}
1634
/**
 * Check content hash in phash table
 *
 * @return	mixed		Returns 1 if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
 */
function checkContentHash() {

	// With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
	$where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	return $row ? $row : 1;
}
1648
/**
 * Check content hash for external documents
 * Looks for an index_phash record with the given phash_grouping and content hash.
 *
 * @param	integer		phash value to check (phash_grouping)
 * @param	integer		Content hash to check
 * @return	boolean		Returns 1 if the document needs to be indexed (that is, there was no result), otherwise 0
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
	$where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);

	// A matching record means the same content is indexed already:
	return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
}
1664
/**
 * Checks if a grlist record has been set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
 *
 * @param	integer		Phash integer to test.
 * @return	integer		Number of index_grlist records found for the phash_x value (zero if none)
 */
function is_grlist_set($phash_x) {
	$where = 'phash_x='.intval($phash_x);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);

	$rowCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

	return $rowCount;
}
1675
/**
 * Makes sure a grlist entry exists for the phash / current gr_list combination; if none is found, one is written and the insert is logged.
 *
 * @param	integer		phash of the search result that should be found
 * @param	integer		The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return	void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x) {
	$where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

		// An entry is already there, nothing to do:
	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		return;
	}

	$this->submit_grlist($phash,$phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
}
1691
/**
 * Sets the tstamp field of an index_phash row to the current time, optionally updating item_mtime as well.
 *
 * @param	integer		phash value
 * @param	integer		If set (non-zero), the item_mtime field is updated to this value.
 * @return	void
 */
function updateTstamp($phash,$mtime=0) {
	$updateFields = array();
	$updateFields['tstamp'] = time();

		// The modification time is only touched when a value was passed:
	if ($mtime) {
		$updateFields['item_mtime'] = intval($mtime);
	}

	$where = 'phash='.intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $updateFields);
}
1707
/**
 * Writes the configured free-index set id ($this->conf['freeIndexSetId']) into the index_phash record.
 *
 * @param	integer		phash value
 * @return	void
 */
function updateSetId($phash) {
	$setId = intval($this->conf['freeIndexSetId']);
	$where = 'phash='.intval($phash);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array('freeIndexSetId' => $setId));
}
1721
/**
 * Stores the parsetime value in the index_phash row of the given phash.
 *
 * @param	integer		phash value.
 * @param	integer		Parsetime value to set.
 * @return	void
 */
function updateParsetime($phash,$parsetime) {
	$where = 'phash='.intval($phash);
	$fields = array();
	$fields['parsetime'] = intval($parsetime);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
1736
/**
 * Refreshes the root-line fields of all index_section records belonging to the current page ($this->conf['id']).
 *
 * @return	void
 */
function updateRootline() {
		// Collect the rl* values (filled by reference):
	$rootLineFields = array();
	$this->getRootLineFields($rootLineFields);

	$where = 'page_id='.intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
}
1749
/**
 * Fills root-line values into the field array.
 * rl0, rl1 and rl2 are always set from levels 0-2 of $this->conf['rootline_uids']. Further fields can be configured in $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'] as "field name => root-line level" pairs.
 *
 * @param	array		Field array, passed by reference
 * @return	void
 */
function getRootLineFields(&$fieldArr) {

		// Standard fields:
	$standardLevels = array('rl0' => 0, 'rl1' => 1, 'rl2' => 2);
	foreach($standardLevels as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}

		// Additionally configured fields (processed last, so they may overrule the standard ones):
	if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
		return;
	}
	foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}
}
1769
/**
 * Removes indexed pages which belong to the current "phash_grouping", carry the current content hash and were indexed under a gr_list different from the default one.
 * According to the original note this method is NOT USED anywhere inside this class.
 *
 * @return	void
 */
function removeLoginpagesWithContentHash() {
	$where = '
				A.phash=B.phash
				AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
				AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
				AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

		// Every hit is logged and then removed from the index:
	while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
		$oldPhash = $row['phash'];
		$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$oldPhash."' are now removed.",1);
		$this->removeOldIndexedPages($oldPhash);
	}
}
1787
/**
 * Loads the library class of the "crawler" extension (class.tx_crawler_lib.php from the extension's path).
 *
 * @return	void
 */
function includeCrawlerClass() {
		// The required file is executed in the scope of this function, so $TYPO3_CONF_VARS is made available here.
		// NOTE(review): presumably the included script reads it (e.g. for XCLASS handling) - confirm before removing.
	global $TYPO3_CONF_VARS;

	require_once t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php';
}
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808 /********************************
1809 *
1810 * SQL; Submitting words
1811 *
1812 *******************************/
1813
/**
 * Adds new words to db
 * Looks up which words of the list are already present in index_words (by word id) and inserts the remaining ones.
 *
 * @param	array		Word List array (key is the word itself; each entry holds at least "hash" (word id) and "metaphone").
 * @return	void
 */
function checkWordList($wl) {
		// Nothing to look up or insert for an empty (or invalid) list:
	if (!is_array($wl) || !count($wl)) {
		return;
	}

		// Collect the word ids. foreach is used instead of each(), which no longer exists in PHP 8.
		// The ids are forced to integers since they are embedded directly into the SQL below.
	$widList = array();
	foreach($wl as $wordInfo) {
		$widList[] = intval($wordInfo['hash']);
	}

	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',',$widList).')');
	$knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

		// All words are known already:
	if ($knownCount==count($wl)) {
		return;
	}

	$this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownCount),1);

		// Drop the words which exist in the table, so only the new ones remain:
	while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
		unset($wl[$row['baseword']]);
	}

	foreach($wl as $word => $wordInfo) {
		$insertFields = array(
			'wid' => $wordInfo['hash'],
			'baseword' => $word,
			'metaphone' => $wordInfo['metaphone']
		);
			// A duplicate-key error will occur here if a word was NOT removed by the unset() above. However, as long as the words in $wl are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
	}
}
1849
/**
 * Writes the RELATIONS between words and a phash into index_rel.
 * Existing relations of the phash are deleted first, then one record per word is inserted.
 *
 * @param	array		Word list array (each entry provides "hash", "count", "first" and "cmp")
 * @param	integer		phash value
 * @return	void
 */
function submitWords($wl,$phash) {
		// Clear out the old relations of this phash:
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

	foreach($wl as $wordInfo) {
			// Share of this word among all words ($this->wordcount), mapped to the stored frequency value:
		$mappedFreq = $this->freqMap(($wordInfo['count']/$this->wordcount));

		$relation = array();
		$relation['phash'] = $phash;
		$relation['wid'] = $wordInfo['hash'];
		$relation['count'] = $wordInfo['count'];
		$relation['first'] = $wordInfo['first'];
		$relation['freq'] = $mappedFreq;
		$relation['flags'] = ($wordInfo['cmp'] & $this->flagBitMask);

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
1873
/**
 * Maps a frequency between its real-number form and its stored form, using the factor $this->freqMax*100*$this->freqRange.
 * Values below 1 are multiplied by the factor and capped at $this->freqRange; values of 1 or more are divided by the factor (the mapping "back").
 *
 * @param	double		Frequency
 * @return	mixed		Mapped frequency (numeric; no rounding or integer cast is applied here).
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax*100*$this->freqRange;

		// Reverse direction: stored value back to a fraction.
	if (!($freq<1)) {
		return $freq/$mapFactor;
	}

		// Forward direction: scale up, never exceeding the range.
	$scaled = $freq*$mapFactor;
	if ($scaled>$this->freqRange) {
		return $this->freqRange;
	}
	return $scaled;
}
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903 /********************************
1904 *
1905 * Hashing
1906 *
1907 *******************************/
1908
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash ("phash_grouping" and "phash").
 *
 * @return	void
 */
function setT3Hashes() {

		// Parameters identifying a "page": id, type, language, mountpoint and cHash parameters.
	$identity = array();
	$identity['id'] = (int)$this->conf['id'];
	$identity['type'] = (int)$this->conf['type'];
	$identity['sys_lang'] = (int)$this->conf['sys_language_uid'];
	$identity['MP'] = (string)$this->conf['MP'];
	$identity['cHash'] = $this->cHashParams;

		// The grouping hash is made from these parameters alone:
	$this->hash['phash_grouping'] = $this->md5inthash(serialize($identity));

		// The plain phash additionally includes gr_list (subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
	$identity['gr_list'] = (string)$this->conf['gr_list'];
	$this->hash['phash'] = $this->md5inthash(serialize($identity));
}
1932
/**
 * Calculates the search hashes for an external file.
 * "phash_grouping" is made from the file name alone, "phash" from the file name plus the sub information.
 *
 * @param	string		File name / path which identifies it on the server
 * @param	array		Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return	array		Array with "phash_grouping" and "phash" inside.
 */
function setExtHashes($file,$subinfo=array()) {
		// Grouping hash: file only.
	$groupingData = array('file' => $file);
	$groupingHash = $this->md5inthash(serialize($groupingData));

		// Plain hash: file and sub information.
	$fullData = array('file' => $file, 'subinfo' => $subinfo);
	$fullHash = $this->md5inthash(serialize($fullData));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $fullHash
	);
}
1956
/**
 * md5 integer hash
 * Only the first 7 hex characters of the md5 hash are used (instead of 8), which keeps the integers at 28 bit - below 32 bit - so they do not interfere with UNSIGNED integers or PHP-versions which have varying output from the hexdec function.
 *
 * @param	string		String to hash
 * @return	integer		Integer interpretation of the (shortened) md5 hash of the input string.
 */
function md5inthash($str) {
	$fullHash = md5($str);
	$shortHex = substr($fullHash, 0, 7);

	return hexdec($shortHex);
}
1967
/**
 * Calculates the cHash value of input GET array (for constructing cHash values if needed)
 * The array is turned into a query string, passed through t3lib_div::cHashParams() and the serialized result is hashed with t3lib_div::shortMD5().
 *
 * @param	array		Array of GET parameters to encode
 * @return	string		The value returned by t3lib_div::shortMD5() for the serialized cHash parameters.
 */
function makeCHash($paramArray) {
	$queryString = t3lib_div::implodeArrayForUrl('', $paramArray);
	$cHashParameters = t3lib_div::cHashParams($queryString);
	$serialized = serialize($cHashParameters);

	return t3lib_div::shortMD5($serialized);
}
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993 /*********************************
1994 *
1995 * Internal logging functions
1996 *
1997 *********************************/
1998
/**
 * Wrapper for the push() method of the global TT object. Does nothing if $GLOBALS['TT'] is not an object.
 *
 * @param	string		Title to set
 * @param	string		Key (?)
 * @return	void
 */
function log_push($msg,$key) {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->push($msg,$key);
}
2009
/**
 * Wrapper for the pull() method of the global TT object. Does nothing if $GLOBALS['TT'] is not an object.
 *
 * @return	void
 */
function log_pull() {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->pull();
}
2018
/**
 * Wrapper for the setTSlogMessage() method of the global TT object.
 * The message is always appended to $this->internal_log, whether or not a TT object is available.
 *
 * @param	string		Message to set
 * @param	integer		Error number
 * @return	void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
	if (is_object($GLOBALS['TT'])) {
		$GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
	}

		// Keep our own copy of the message:
	$this->internal_log[] = $msg;
}
2030
2031
2032
2033
2034
2035
2036
2037
2038 /**************************
2039 *
2040 * tslib_fe hooks:
2041 *
2042 **************************/
2043
/**
 * Frontend hook: If the page is not being re-generated this is our chance to force it to be (because re-generation of the page is required in order to have the indexer called!)
 * The cache look-up is disabled only when the crawler extension is loaded, a crawler session is running and re-indexing was requested as processing instruction.
 *
 * @param	array		Parameters from frontend (passed by reference; "disableAcquireCacheData" may be set to TRUE)
 * @param	object		TSFE object (reference under PHP5)
 * @return	void
 */
function fe_headerNoCache(&$params, $ref) {

		// Requirement 1: The crawler extension is loaded.
	if (!t3lib_extMgm::isLoaded('crawler')) {
		return;
	}

		// Requirement 2: A crawler session is running.
	if (!$params['pObj']->applicationData['tx_crawler']['running']) {
		return;
	}

		// Requirement 3: Re-indexing is among the requested processing instructions.
	if (!in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])) {
		return;
	}

		// Simple log entry, recording the previous state of the flag:
	$params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];

		// No look-up for cached page data - the page is re-generated even if it is cached.
	$params['disableAcquireCacheData'] = TRUE;
}
2065 }
2066
2067
2068 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
2069 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
2070 }
2071 ?>