70c6cb60584041d8ac6de4a68ef5f1ea0e080abf
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 141: class tx_indexedsearch_indexer
39 * 207: function hook_indexContent(&$pObj)
40 *
41 * SECTION: Backend API
42 * 308: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
43 * 347: function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
44 * 365: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
45 *
46 * SECTION: Initialization
47 * 416: function init()
48 * 468: function initializeExternalParsers()
49 *
50 * SECTION: Indexing; TYPO3 pages (HTML content)
51 * 509: function indexTypo3PageContent()
52 * 596: function splitHTMLContent($content)
53 * 642: function getHTMLcharset($content)
54 * 657: function convertHTMLToUtf8($content,$charset='')
55 * 685: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
56 * 712: function typoSearchTags(&$body)
57 * 741: function extractLinks($content)
58 * 812: function extractHyperLinks($string)
59 *
60 * SECTION: Indexing; external URL
61 * 871: function indexExternalUrl($externalUrl)
62 * 902: function getUrlHeaders($url)
63 *
64 * SECTION: Indexing; external files (PDF, DOC, etc)
65 * 948: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
66 * 1054: function readFileContent($ext,$absFile,$cPKey)
67 * 1071: function fileContentParts($ext,$absFile)
68 * 1089: function splitRegularContent($content)
69 *
70 * SECTION: Analysing content, Extracting words
71 * 1122: function charsetEntity2utf8(&$contentArr, $charset)
72 * 1145: function processWordsInArrays($contentArr)
73 * 1170: function procesWordsInArrays($contentArr)
74 * 1180: function bodyDescription($contentArr)
75 * 1202: function indexAnalyze($content)
76 * 1223: function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
77 * 1242: function analyzeBody(&$retArr,$content)
78 * 1262: function metaphone($word,$retRaw=FALSE)
79 *
80 * SECTION: SQL; TYPO3 Pages
81 * 1304: function submitPage()
82 * 1378: function submit_grlist($hash,$phash_x)
83 * 1398: function submit_section($hash,$hash_t3)
84 * 1416: function removeOldIndexedPages($phash)
85 *
86 * SECTION: SQL; External media
87 * 1459: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
88 * 1525: function submitFile_grlist($hash)
89 * 1539: function submitFile_section($hash)
90 * 1553: function removeOldIndexedFiles($phash)
91 *
92 * SECTION: SQL Helper functions
93 * 1589: function checkMtimeTstamp($mtime,$phash)
94 * 1625: function checkContentHash()
95 * 1642: function checkExternalDocContentHash($hashGr,$content_md5h)
96 * 1656: function is_grlist_set($phash_x)
97 * 1669: function update_grlist($phash,$phash_x)
98 * 1684: function updateTstamp($phash,$mtime=0)
99 * 1699: function updateSetId($phash)
100 * 1714: function updateParsetime($phash,$parsetime)
101 * 1727: function updateRootline()
102 * 1742: function getRootLineFields(&$fieldArr)
103 * 1761: function removeLoginpagesWithContentHash()
104 * 1778: function includeCrawlerClass()
105 *
106 * SECTION: SQL; Submitting words
107 * 1805: function checkWordList($wl)
108 * 1842: function submitWords($wl,$phash)
109 * 1866: function freqMap($freq)
110 *
111 * SECTION: Hashing
112 * 1899: function setT3Hashes()
113 * 1925: function setExtHashes($file,$subinfo=array())
114 * 1949: function md5inthash($str)
115 * 1959: function makeCHash($paramArray)
116 *
117 * SECTION: Internal logging functions
118 * 1991: function log_push($msg,$key)
119 * 2000: function log_pull()
120 * 2011: function log_setTSlogMessage($msg, $errorNum=0)
121 *
122 * SECTION: tslib_fe hooks:
123 * 2036: function fe_headerNoCache(&$params, $ref)
124 *
125 * TOTAL FUNCTIONS: 59
126 * (This index is automatically created/updated by the extension "extdeveval")
127 *
128 */
129 /**
130 * Indexing class for TYPO3 frontend
131 *
132 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
133 * @package TYPO3
134 * @subpackage tx_indexedsearch
135 */
136 class tx_indexedsearch_indexer {
137
138 // Messages:
139 var $reasons = array(
140 -1 => 'mtime matched the document, so no changes detected and no content updated',
141 -2 => 'The minimum age was not exceeded',
142 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
143 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
144 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
145 4 => 'Page has never been indexed (is not represented in the index_phash table).'
146 );
147
148 // HTML code blocks to exclude from indexing:
149 var $excludeSections = 'script,style';
150
151 // Supported Extensions for external files:
152 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods.
153
154 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
155 var $defaultGrList = '0,-1';
156
157 // Min/Max times:
158 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
159 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
160 var $maxExternalFiles = 0; // Max number of external files to index.
161
162 var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc.
163 var $crawlerActive = FALSE; // Set when crawler is detected (internal)
164
165 // INTERNALS:
166 var $defaultContentArray=array(
167 'title' => '',
168 'description' => '',
169 'keywords' => '',
170 'body' => '',
171 );
172 var $wordcount = 0;
173 var $externalFileCounter = 0;
174
175 var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning)
176 var $indexerConfig = array(); // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
177 var $hash = array(); // Hash array, contains phash and phash_grouping
178 var $file_phash_arr = array(); // Hash array for files
179 var $contentParts = array(); // Content of TYPO3 page
180 var $content_md5h = '';
181 var $internal_log = array(); // Internal log
182 var $indexExternalUrl_content = '';
183
184 var $cHashParams = array(); // cHashparams array
185
186 var $freqRange = 32000;
187 var $freqMax = 0.1;
188
189 // Objects:
190 /**
191 * Charset class object
192 *
193 * @var t3lib_cs
194 */
195 var $csObj;
196
197 /**
198 * Metaphone object, if any
199 *
200 * @var user_DoubleMetaPhone
201 */
202 var $metaphoneObj;
203
204 /**
205 * Lexer object for word splitting
206 *
207 * @var tx_indexedsearch_lexer
208 */
209 var $lexerObj;
210
211
212
/**
 * Frontend hook: decides whether the page just rendered should be indexed and, if so,
 * copies the relevant state from the TSFE object into $this->conf and indexes the page.
 *
 * Independently of the indexing decision, a running crawler session that requests the
 * "tx_indexedsearch_reindex" processing instruction switches on forced indexing.
 *
 * Nothing is done unless config.index_enable is set. When it is set, indexing is still
 * skipped (and a message is written to the TS log) if frontend indexing is disabled and
 * no crawler is active, the page has the "no_search" flag, the page is rendered with
 * "no_cache", or sys_language_uid differs from sys_language_content.
 *
 * @param	object		Parent Object (frontend TSFE object), passed by reference
 * @return	void
 */
function hook_indexContent(&$pObj) {

		// Indexer configuration from Extension Manager interface.
		// unserialize() returns FALSE for a missing or corrupt value, so fall back to an empty array:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}

		// Crawler activation:
		// Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
		// The processing instructions are only searched if they really are an array, since in_array() fails on anything else.
	if (t3lib_extMgm::isLoaded('crawler')
			&& !empty($pObj->applicationData['tx_crawler']['running'])
			&& isset($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
			&& is_array($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
			&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {

			// Setting simple log message:
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

			// Setting variables:
		$this->crawlerActive = TRUE;	// Crawler active flag
		$this->forceIndexing = TRUE;	// Force indexing despite timestamps etc.
	}

		// Nothing more to do if indexing is not enabled for this page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

		// Find the first reason (if any) that forbids indexing. The checks are made in the same order as the messages are listed:
	$skipMessage = '';
	if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
		$skipMessage = 'Index page? No, Ordinary Frontend indexing during rendering is disabled.';
	} elseif ($pObj->page['no_search']) {
		$skipMessage = 'Index page? No, The "No Search" flag has been set in the page properties!';
	} elseif ($pObj->no_cache) {
		$skipMessage = 'Index page? No, page was set to "no_cache" and so cannot be indexed.';
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$skipMessage = 'Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.';
	}

	if ($skipMessage !== '') {
		$this->log_setTSlogMessage($skipMessage);
	} else {

			// Setting up internal configuration from config array:
		$this->conf = array();

			// Information about page for which the indexing takes place
		$this->conf['id'] = $pObj->id;				// Page id
		$this->conf['type'] = $pObj->type;			// Page type
		$this->conf['sys_language_uid'] = $pObj->sys_language_uid;	// sys_language UID of the language of the indexing.
		$this->conf['MP'] = $pObj->MP;				// MP variable, if any (Mount Points)
		$this->conf['gr_list'] = $pObj->gr_list;	// Group list

		$this->conf['cHash'] = $pObj->cHash;					// cHash string for additional parameters
		$this->conf['cHash_array'] = $pObj->cHash_array;		// Array of the additional parameters

		$this->conf['crdate'] = $pObj->page['crdate'];			// The creation date of the TYPO3 page
		$this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;	// reg1 of the caching table. Not known what practical use this has.

			// Root line uids
		$this->conf['rootline_uids'] = array();
		foreach($pObj->config['rootLine'] as $rlkey => $rldat) {
			$this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
		}

			// Content of page:
		$this->conf['content'] = $pObj->content;	// Content string (HTML of TYPO3 page)
		$this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);	// Alternative title for indexing
		$this->conf['metaCharset'] = $pObj->metaCharset;	// Character set of content (will be converted to utf-8 during indexing)
		$this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];	// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

			// Configuration of behavior:
		$this->conf['index_externals'] = $pObj->config['config']['index_externals'];	// Whether to index external documents like PDF, DOC etc. (if possible)
		$this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];		// Length of description text (max 250, default 200)
		$this->conf['index_metatags'] = isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true;	// Meta tags are indexed unless explicitly configured otherwise

			// Set to zero:
		$this->conf['recordUid'] = 0;
		$this->conf['freeIndexUid'] = 0;
		$this->conf['freeIndexSetId'] = 0;

			// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
295
296
297
298
299
300
301
302
303 /****************************
304 *
305 * Backend API
306 *
307 ****************************/
308
/**
 * Backend entry point: resets $this->conf, fills it with the identity of the page being
 * indexed (or to which external media is attached) plus fixed behaviour defaults, and
 * then runs init().
 *
 * @param	integer		The page uid, &id=
 * @param	integer		The page type, &type=
 * @param	integer		sys_language uid, typically &L=
 * @param	string		The MP variable (Mount Points), &MP=
 * @param	array		Rootline array of only UIDs.
 * @param	array		Array of GET variables to register with this indexing
 * @param	boolean		If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
 * @return	void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

		// Start from a clean configuration holding the page identity:
	$this->conf = array(
		'id' => $id,								// Page id (integer)
		'type' => $type,							// Page type (integer)
		'sys_language_uid' => $sys_language_uid,	// sys_language UID of the language of the indexing (integer)
		'MP' => $MP,								// MP variable, if any (Mount Points) (string)
		'gr_list' => '0,-1',						// Group list (hardcoded for now...)
	);

		// cHash string for the additional parameters is only calculated on request:
	if ($createCHash) {
		$this->conf['cHash'] = $this->makeCHash($cHash_array);
	} else {
		$this->conf['cHash'] = '';
	}
	$this->conf['cHash_array'] = $cHash_array;		// Array of the additional parameters

		// Remaining defaults, rootline and behaviour settings:
	$this->conf = array_merge($this->conf, array(
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',
		'rootline_uids' => $uidRL,		// Root line uids
		'index_externals' => 1,			// Whether to index external documents like PDF, DOC etc. (if possible)
		'index_descrLgd' => 200,		// Length of description text (max 250, default 200)
		'index_metatags' => true,		// Whether to index document keywords and description (if present)
	));

		// Initialize the indexer with this configuration:
	$this->init();
}
353
/**
 * Stores the free-index uid and the indexing set id in $this->conf.
 * Can be called right after backend_initIndexer()
 *
 * @param	integer		Free index UID
 * @param	integer		Set id - an integer identifying the "set" of indexing operations.
 * @return	void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId,
	);

	foreach ($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
365
/**
 * Indexes arbitrary record content as if it was the content of a TYPO3 page.
 * The given values are escaped and wrapped into a minimal HTML document, which is then
 * handed to indexTypo3PageContent() through $this->conf.
 *
 * @param	string		Title equivalent
 * @param	string		Keywords equivalent
 * @param	string		Description equivalent
 * @param	string		The main content to index
 * @param	string		The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param	integer		Last modification time, in seconds
 * @param	integer		The creation date of the content, in seconds
 * @param	integer		The record UID that the content comes from (for registration with the indexed rows)
 * @return	void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

		// Escape all values before they are embedded in the fake document:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);

		// Time stamps and origin of the content:
	$this->conf['mtime'] = $mtime;			// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;		// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

		// Fake HTML document that the page parser can split into title, meta tags and body:
	$this->conf['content'] = '
<html>
<head>
<title>'.$escapedTitle.'</title>
<meta name="keywords" content="'.$escapedKeywords.'" />
<meta name="description" content="'.$escapedDescription.'" />
</head>
<body>
'.$escapedBody.'
</body>
</html>';

		// Charset of the content (will be converted to utf-8 during indexing); no alternative title is used:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

		// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}
406
407
408
409
410
411
412
413
414
415
416
417
418
419 /********************************
420 *
421 * Initialization
422 *
423 *******************************/
424
/**
 * Initializes the object: cHash parameters, page hashes, limits from the extension
 * configuration, external parsers, lexer, optional metaphone object and charset object.
 * $this->conf MUST be set with proper values prior to this call!!!
 *
 * @return	void
 */
function init() {
	global $TYPO3_CONF_VARS;

		// Take over the cHash parameters from the configuration:
	$this->cHashParams = $this->conf['cHash_array'];
	$hasCHashParams = is_array($this->cHashParams) && count($this->cHashParams);
	if ($hasCHashParams) {
			// Add the cHash itself so that URL's come out right...
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
			// encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
		unset($this->cHashParams['encryptionKey']);
	}

		// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
	$this->setT3Hashes();

		// Indexer configuration from Extension Manager interface (minAge / maxAge are multiplied by 3600 before use):
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$ageFactor = 3600;
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*$ageFactor,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*$ageFactor,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

		// Initialize external document parsers, but only if external documents are to be indexed:
		// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Initialize lexer (class that deconstructs the text into words). A configured lexer replaces the default one:
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer']) {
		$lexerReference = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'];
	} else {
		$lexerReference = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	}
	$this->lexerObj = &t3lib_div::getUserObj($lexerReference);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

		// Initialize metaphone hook, only if one is configured:
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = &$this;
	}

		// Init charset class:
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
474
/**
 * Initialize external parsers.
 * For every file extension configured in $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']
 * the parser object is created, given a back-reference to this indexer in ->pObj and asked
 * to initialize itself with initParser(). Parsers that cannot be created or whose
 * initParser() returns false are not kept in $this->external_parsers.
 *
 * @return	void
 * @access private
 * @see init()
 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

		// Nothing to do without a parser configuration:
	if (!isset($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])
			|| !is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}

	foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

			// A reference that did not produce an object must not be used as a parser,
			// otherwise the method call below would end in a fatal error:
		if (!is_object($this->external_parsers[$extension])) {
			unset($this->external_parsers[$extension]);
			continue;
		}

		$this->external_parsers[$extension]->pObj = &$this;

			// Init parser and if it returns false, unset its entry again:
		if (!$this->external_parsers[$extension]->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512 /********************************
513 *
514 * Indexing; TYPO3 pages (HTML content)
515 *
516 *******************************/
517
/**
 * Start indexing of the TYPO3 page described by $this->conf.
 *
 * The page is processed when checkMtimeTstamp() returns a positive reason code, when no
 * gr_list entry exists for the phash, or when indexing is forced. If a page with the same
 * content hash is already indexed (and the reason code is not 1, "max-age exceeded"),
 * only timestamp, set id, gr_list and rootline are updated; otherwise the content is
 * converted, split into words, analysed and submitted, and linked external files are
 * checked if configured.
 *
 * @return	void
 */
function indexTypo3PageContent() {

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

		// Leave early if there is no reason at all to look at the page:
	if (!($check > 0 || !$is_grlist || $this->forceIndexing)) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

		// Setting message:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

		// Divide into title,keywords,description and body:
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

		// Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
		// Note: the separator is passed first; the reversed (array, separator) argument order is no longer accepted by PHP 8.
	$this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

		// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
		// If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
		// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
	$checkCHash = $this->checkContentHash();
	if (!is_array($checkCHash) || $check===1) {
		$Pstart=t3lib_div::milliseconds();

		$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
		$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
		$this->log_pull();

			// Splitting words
		$this->log_push('Extract words from content','');
		$splitInWords = $this->processWordsInArrays($this->contentParts);
		$this->log_pull();

			// Analyse the indexed words.
		$this->log_push('Analyse the extracted words','');
		$indexArr = $this->indexAnalyze($splitInWords);
		$this->log_pull();

			// Submitting page (phash) record
		$this->log_push('Submitting page','');
		$this->submitPage();
		$this->log_pull();

			// Check words and submit to word list if not there
		$this->log_push('Check word list and submit words','');
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr,$this->hash['phash']);
		$this->log_pull();

			// Set parsetime (time spent since $Pstart)
		$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

			// Checking external files if configured for.
		$this->log_push('Checking external files','');
		if ($this->conf['index_externals']) {
			$this->extractLinks($this->conf['content']);
		}
		$this->log_pull();
	} else {
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
	}
}
602
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything from the first "<body" on is the body part, everything before it the head
 * part. The title is the text after the first ":" in <title> (or the whole title if there
 * is no ":"). Keywords and description are collected from the meta tags only when
 * $this->conf['index_metatags'] is set. The body is reduced by the TYPO3SEARCH markers,
 * stripped of the sections listed in $this->excludeSections and of all tags.
 *
 * @param	string		HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return	array		Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
function splitHTMLContent($content) {

		// divide head from body ( u-ouh :) )
		// NOTE: if no "<body" is found, stristr() returns FALSE, the length argument below becomes 0 and the head part is empty.
	$contentArr = $this->defaultContentArray;
	$contentArr['body'] = stristr($content,'<body');
	$headPart = substr($content,0,-strlen($contentArr['body']));

		// get title
	$this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
	$titleParts = explode(':',$contentArr['title'],2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

		// get keywords and description metatags
	if($this->conf['index_metatags']) {
		$meta = array();

			// Collect the attribute strings of all meta tags; each call continues in what is left of the head part:
		for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }

		for($i=0;isset($meta[$i]);$i++) {
			$meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);

				// Meta tags without a name (eg. http-equiv ones) carry neither keywords nor description:
			if (!isset($meta[$i]['name'])) {
				continue;
			}
			$metaContent = isset($meta[$i]['content']) ? $meta[$i]['content'] : '';

			if(stristr($meta[$i]['name'],'keywords')) $contentArr['keywords'].=','.$metaContent;
			if(stristr($meta[$i]['name'],'description')) $contentArr['description'].=','.$metaContent;
		}
	}

		// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

		// Get rid of unwanted sections (ie. scripting and style stuff) in body
	$tagList = explode(',',$this->excludeSections);
	foreach($tagList as $tag) {
			// Each successful call removes one occurrence of the tag from the body:
		while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
	}

		// remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<',' <',$contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

		// Return array
	return $contentArr;
}
651
/**
 * Extract the charset value from the HTML meta tag with http-equiv="content-type".
 *
 * @param	string		HTML content
 * @return	string		The charset value if found, otherwise NULL.
 */
function getHTMLcharset($content) {
	$metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
	$charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';

		// First locate the content-type meta tag...
	if (!preg_match($metaTagPattern,$content,$metaTagMatch)) {
		return NULL;
	}

		// ...then look for the charset inside that tag only:
	if (!preg_match($charsetPattern,$metaTagMatch[0],$charsetMatch)) {
		return NULL;
	}

	return $charsetMatch[1];
}
665
/**
 * Converts a HTML document to utf-8: the charset is converted first (unless it already
 * is utf-8 or unknown), afterwards entities are converted.
 *
 * @param	string		HTML content, any charset
 * @param	string		Optional charset (otherwise extracted from HTML)
 * @return	string		Converted HTML
 */
function convertHTMLToUtf8($content,$charset='') {

		// Without an explicit charset, look for one in the document itself:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

		// Convert charset:
	$needsConversion = $charset && $charset!=='utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

		// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
688
/**
 * Finds first occurence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The tag is searched case-insensitively. If the start tag has no matching end tag, the
 * tag content is blank and only what follows the start tag is returned in $stringAfter
 * (the text before the tag is dropped in that case).
 *
 * @param	string		String to search in
 * @param	string		Tag name, eg. "script"
 * @param	string		Passed by reference: Content inside found tag
 * @param	string		Passed by reference: Content after found tag
 * @param	string		Passed by reference: Attributes of the found tag.
 * @return	boolean		Returns false if tag was not found, otherwise true.
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$endTag = '</'.$tagName.'>';
	$startTag = '<'.$tagName;

		// stristr used because we want a case-insensitive search for the tag.
	$fromStartTag = stristr($string,$startTag);
	if ($fromStartTag === FALSE) {
		return false;	// if the tag was not found, return false
	}

		// Split what follows the tag name into the attribute list and the rest after ">".
		// array_pad() makes sure both parts exist even if the start tag is never closed with ">".
	$tagParts = array_pad(explode('>',substr($fromStartTag,strlen($startTag)),2),2,'');
	$paramList = $tagParts[0];
	$afterStartTag = $tagParts[1];

	$fromEndTag = stristr($afterStartTag,$endTag);
	if ($fromEndTag !== FALSE) {
			// The found tag starts where the remainder returned by stristr() begins:
		$stringBefore = substr($string,0,strlen($string)-strlen($fromStartTag));
		$tagContent = substr($afterStartTag,0,strlen($afterStartTag)-strlen($fromEndTag));
		$stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
	} else {	// If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
		$tagContent = '';
		$stringAfter = $afterStartTag;
	}

	return true;
}
721
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin" marker
 * is kept; at an "end" marker the last remembered chunk that was not introduced by a
 * marker (normally the text before the first marker) is kept. If no marker is found the
 * body is left untouched.
 *
 * @param	string		HTML Content, passed by reference
 * @return	boolean		Returns true if a TYPOSEARCH_ tag was found, otherwise false.
 */
function typoSearchTags(&$body) {
	$expBody = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

		// No marker found (or the split failed): leave the body as it is.
	if (!is_array($expBody) || count($expBody) <= 1) {
		return false;
	}

	$body = '';
	$prev = '';		// Initialized so that an "end" marker at the very start does not read an undefined variable

	foreach($expBody as $val) {
		$part = explode('-->',$val,2);
		$marker = trim($part[0]);

		if ($marker === 'begin') {
				// A marker that is never closed with "-->" has no content to add:
			$body.= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker === 'end') {
			$body.= $prev;
		} else {
			$prev = $val;
		}
	}

	return true;
}
750
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme are passed to indexExternalUrl() if indexing of external URLs is
 * configured. Links without scheme and without query string are resolved to a local file;
 * an existing file is either added to the crawler queue (if the crawler is configured and
 * loaded) or indexed directly with indexRegularDocument().
 *
 * @param	string		HTML content
 * @return	void
 */
function extractLinks($content) {

		// Get links:
	$list = $this->extractHyperLinks($content);

		// The crawler object only exists if the crawler is to be used for external files:
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

		// Traverse links:
	foreach($list as $linkInfo) {

			// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
		$hasLocalPath = !empty($linkInfo['localPath']);

			// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

			// Parse URL (parse_url() returns FALSE for seriously malformed URLs, which is treated like "no parts"):
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}

			// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);

				// Only a plain string can be a link target; anything else is treated as an empty link:
			$linkSource = (isset($getP['jumpurl']) && is_string($getP['jumpurl'])) ? $getP['jumpurl'] : '';

				// parse again due to new linkSource!
			$qParts = parse_url($linkSource);
			if (!is_array($qParts)) {
				$qParts = array();
			}
		}

		if (!empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
					// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

			// Links with a query string are not regarded as files:
		if (!empty($qParts['query'])) {
			continue;
		}

		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

			// Index local file:
		if (is_object($crawler)) {
				// Let the crawler do the work later; "alturl" is only set for files delivered by a download script:
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($hasLocalPath) {
				// Index under the href, read the content from the local path and pass its file extension along:
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
835
/**
 * Collects all <a> tags with a non-empty href attribute from the content string.
 * For each link it is also looked up (by short MD5 of the href) whether a local file path has been
 * registered in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 *
 * @param	string		Content to analyse
 * @return	array		Array of hyperlinks; each entry has the keys "tag", "href" and "localPath"
 * @see extractLinks()
 */
function extractHyperLinks($string) {
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$list = array();
	foreach ($this->htmlParser->splitTags('a',$string) as $index => $tag) {
		// Only the odd positions of the split result are looked at:
		if (!($index%2)) {
			continue;
		}

		$attributes = $this->htmlParser->get_tag_attributes($tag,1);
		$firstTagName = $this->htmlParser->getFirstTagName($tag);	// The 'name' of the first tag
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

		$src = $attributes[0]['href'];
		if (!$src) {
			continue;
		}

		// Check if a local path to that file has been set - useful if you are using a download script.
		$md5 = t3lib_div::shortMD5($src);
		$indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
		if (is_array($indexLocalFiles)) {
			$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
		} else {
			$localPath = false;
		}

		$list[] = array(
			'tag' => $tag,
			'href' => $attributes[0]['href'],
			'localPath' => $localPath
		);
	}

	return $list;
}
878
879
880
881
882
883
884
885
886
887
888
889 /******************************************
890 *
891 * Indexing; external URL
892 *
893 ******************************************/
894
/**
 * Index External URLs HTML content
 * The URL is only fetched and indexed if its response headers announce a "text/html" content type.
 * The fetched content is written to a temporary file which is indexed through indexRegularDocument()
 * and removed afterwards.
 *
 * @param	string		URL, eg. "http://typo3.org/"
 * @return	void
 * @see indexRegularDocument()
 */
function indexExternalUrl($externalUrl) {

	// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);

	// HTTP header names are case-insensitive, so "content-type" must match as well as "Content-Type".
	// If the header occurs more than once, the last occurrence wins.
	$contentType = '';
	if (is_array($urlHeaders)) {
		foreach ($urlHeaders as $headerName => $headerValue) {
			if (!strcasecmp(trim((string)$headerName), 'content-type')) {
				$contentType = (string)$headerValue;
			}
		}
	}

	if (!stristr($contentType,'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

	// Create temporary file:
	$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
	$tmpFile = $tmpBase.'.html';
	t3lib_div::writeFile($tmpFile, $content);

	// Index that file:
	$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');	// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)

	if (@is_file($tmpFile)) {
		unlink($tmpFile);
	}
	// NOTE(review): t3lib_div::tempnam() presumably creates the un-suffixed file itself (like PHP's tempnam()) - confirm. If so it must be removed as well.
	if ($tmpBase && @is_file($tmpBase)) {
		unlink($tmpBase);
	}
}
925
/**
 * Getting HTTP response headers of URL
 * Every non-empty header line is split at its first colon: the part before it becomes the array key,
 * the part after it (not trimmed here) the value. Lines without a colon, such as the status line,
 * are stored with a NULL value.
 *
 * @param	string		The URL
 * @return	mixed		If no answer, returns false. Otherwise an array where HTTP headers are keys
 */
function getUrlHeaders($url) {
	$content = t3lib_div::getURL($url,2);	// Try to get the headers only

	if (!strlen($content)) {
		return FALSE;
	}

	// Compile headers:
	$retVal = array();
	foreach (t3lib_div::trimExplode(chr(10),$content,1) as $line) {
		if (!strlen(trim($line))) {
			break;	// Stop at the first empty line (= end of header)
		}

		$lineParts = explode(':', $line, 2);
		$retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
	}

	return $retVal;
}
951
952
953
954
955
956
957
958
959
960
961
962
963
964 /******************************************
965 *
966 * Indexing; external files (PDF, DOC, etc)
967 *
968 ******************************************/
969
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 * The document is processed once per "content part" reported by fileContentParts(). A part is
 * (re-)indexed if checkMtimeTstamp() asks for it or if $force is set, provided the external file
 * limit is not exceeded and the content hash is not already known. A section record is ensured
 * for every part, whether or not it was indexed.
 *
 * @param	string		Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param	boolean		If set, indexing is forced (despite content hashes, mtime etc).
 * @param	string		Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param	string		File extension for temporary file.
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

	// Init
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

	// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

	// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

			// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

				// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
					// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
					// The separator goes first: implode() no longer accepts it after the array in PHP 8.
					$content_md5h = $this->md5inthash(implode('',$contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

						// Increment counter:
						$this->externalFileCounter++;

						// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

						// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

						// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

						// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

						// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

		// Checking and setting sections:
		$this->submitFile_section($phash_arr['phash']);	// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->log_pull();
	}
}
1075
/**
 * Reads the content of an external file being indexed.
 * The work is delegated to the external parser object registered for the file extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return	mixed		Whatever the parser returns (expected: standard content array with title, description, keywords, body keys). NULL if no parser object is registered for the extension.
 */
function readFileContent($ext,$absFile,$cPKey) {
	// Explicit default so that a defined value is returned when no parser is available:
	$contentArr = NULL;

	// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
1094
/**
 * Asks the external parser for the sections a document should be divided into.
 * If no parser object is registered for the extension, a single section "0" is returned.
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Array of pointers to sections that the document should be divided into
 */
function fileContentParts($ext,$absFile) {
	// Consult relevant external document parser:
	if (is_object($this->external_parsers[$ext])) {
		return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
	}

	// No parser object: the document is treated as one undivided part.
	return array(0);
}
1112
/**
 * Wraps non-HTML content (from external files for instance) into a content array.
 * The array starts out as a copy of $this->defaultContentArray; only the "body" key is set here.
 *
 * @param	string		Input content (non-HTML) to index.
 * @return	array		Copy of the default content array with the key "body" set to the input content
 * @see splitHTMLContent()
 */
function splitRegularContent($content) {
	$result = $this->defaultContentArray;
	$result['body'] = $content;

	return $result;
}
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140 /**********************************
1141 *
1142 * Analysing content, Extracting words
1143 *
1144 **********************************/
1145
/**
 * Convert character set and HTML entities in the value of input content array keys
 * Empty values are skipped. The array is modified in place.
 *
 * @param	array		Standard content array, passed by reference
 * @param	string		Charset of the input content (converted to utf-8)
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset) {

	// Iterating over the keys (instead of each(), which no longer exists in PHP 8) keeps the in-place updates independent of the iteration.
	foreach (array_keys($contentArr) as $key) {
		if (!strlen($contentArr[$key])) {
			continue;
		}

		// Convert charset if necessary
		if ($charset!=='utf-8') {
			$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
		}

		// decode all numeric / html-entities in the string to real characters:
		$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
	}
}
1169
/**
 * Processing words in the array from split*Content -functions
 * Every value of the array is passed through the lexer's split2Words(); duplicates are then
 * removed from "title", "keywords" and "description" (but not from the other keys).
 *
 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return	array		Content input array modified so each key is now an array of words (unique for title, keywords and description)
 */
function processWordsInArrays($contentArr) {

	// split all parts to words (foreach instead of each(), which no longer exists in PHP 8)
	foreach (array_keys($contentArr) as $key) {
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

	// For title, keywords, and description we don't want duplicates:
	$contentArr['title'] = array_unique($contentArr['title']);
	$contentArr['keywords'] = array_unique($contentArr['keywords']);
	$contentArr['description'] = array_unique($contentArr['description']);

	// Return modified array:
	return $contentArr;
}
1192
/**
 * Backwards compatible alias (with the historic misspelling) of processWordsInArrays().
 * It does nothing but forward the call.
 *
 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return	array		Result of processWordsInArrays() for the input array
 * @deprecated since TYPO3 4.0
 * @see processWordsInArrays()
 */
function procesWordsInArrays($contentArr) {
	$processed = $this->processWordsInArrays($contentArr);

	return $processed;
}
1204
/**
 * Extracts the sample description text from the content array.
 * Tabs and line breaks of the body are replaced by spaces and the result is truncated to the
 * configured length ($this->conf['index_descrLgd'], passed through t3lib_div::intInRange() with
 * the arguments 0, 255, 200).
 *
 * @param	array		Content array
 * @return	string		Description string; empty if the resulting maximum length is zero
 */
function bodyDescription($contentArr) {
	// Defined up front so a string is returned even when no description is built:
	$bodyDescription = '';

	// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1226
/**
 * Builds the index array from the word lists of a content array.
 * Title, keywords and description are analysed as header information with the bit offsets
 * 7, 6 and 5 respectively, after that the body is analysed.
 *
 * @param	array		Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return	array		Index array, filled by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content) {
	$indexArr = array();

	// Header field => bit offset passed on to analyzeHeaderinfo()
	$headerOffsets = array(
		'title' => 7,
		'keywords' => 6,
		'description' => 5
	);
	foreach ($headerOffsets as $headerKey => $bitOffset) {
		$this->analyzeHeaderinfo($indexArr,$content,$headerKey,$bitOffset);
	}

	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}
1244
/**
 * Calculates relevant information for headercontent
 * For every word of the given header field the bit 2^$offset is set in "cmp", "count" is
 * incremented and "hash" and "metaphone" are (re-)calculated. $this->wordcount is incremented per word.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @param	string		Key from standard content array
 * @param	integer		Bit-wise priority to type
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
	// Nothing to analyse if the field is missing or not a word list:
	if (!isset($content[$key]) || !is_array($content[$key])) {
		return;
	}

	$typeBit = pow(2,$offset);

	// foreach instead of each(), which no longer exists in PHP 8
	foreach ($content[$key] as $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		// A word seen for the first time has no "cmp" / "count" yet; start both at zero.
		$cmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

		$retArr[$val]['cmp'] = $cmp|$typeBit;
		$retArr[$val]['count'] = $count+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
1265
/**
 * Calculates relevant information for bodycontent
 * A word that is not in the index array yet gets "first" (its key in the body word list), "hash"
 * and "metaphone" set. "count" and $this->wordcount are incremented for every occurrence.
 *
 * @param	array		Index array, passed by reference
 * @param	array		Standard content array
 * @return	void
 */
function analyzeBody(&$retArr,$content) {
	foreach ($content['body'] as $key => $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		if (!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

		// A new word has no "count" yet; start at zero instead of reading an undefined index.
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $count+1;
		$this->wordcount++;
	}
}
1285
/**
 * Creating metaphone based hash from input word
 * Uses $this->metaphoneObj (called with the word and the configured sys_language_uid) if it is an
 * object, otherwise PHP's native metaphone() function.
 *
 * @param	string		Word to convert
 * @param	boolean		If set, returns the raw metaphone value (not hashed)
 * @return	mixed		Metaphone hash integer, 0 (zero) for an empty metaphone value (or raw value, string)
 */
function metaphone($word,$retRaw=FALSE) {
	$tmp = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

	// Return raw value?
	if ($retRaw) {
		return $tmp;
	}

	// An empty metaphone value maps to zero...
	if ($tmp=='') {
		return 0;
	}

	// ...everything else to an integer made from the first 7 hex digits of its md5:
	return hexdec(substr(md5($tmp),0,7));
}
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324 /********************************
1325 *
1326 * SQL; TYPO3 Pages
1327 *
1328 *******************************/
1329
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 * Existing records for the phash are removed first. Then index_phash, index_section, index_grlist
 * and index_fulltext records are written, plus an index_debug record if debug mode is configured.
 *
 * @return	void
 */
function submitPage() {

	// Remove any current data for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);

	// PROCESSING index_phash
	$phashRow = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

	// PROCESSING index_section
	$this->submit_section($this->hash['phash'],$this->hash['phash']);

	// PROCESSING index_grlist
	$this->submit_grlist($this->hash['phash'],$this->hash['phash']);

	// PROCESSING index_fulltext (cut to the configured length, if any)
	$fulltextRow = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => implode(' ', $this->contentParts)
	);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

	// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

	// Page content and body are cut to 1000 bytes for the debug record:
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
		'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $this->hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
}
1400
/**
 * Inserts an index_grlist record for the gr_list of the current configuration ($this->conf['gr_list']).
 *
 * @param	integer		Search result record phash
 * @param	integer		Actual phash of current content
 * @return	void
 * @see update_grlist()
 */
function submit_grlist($hash,$phash_x) {
	$grList = $this->conf['gr_list'];

	// Setting the gr_list record
	$GLOBALS['TYPO3_DB']->exec_INSERTquery(
		'index_grlist',
		array(
			'phash' => $hash,
			'phash_x' => $phash_x,
			'hash_gr_list' => $this->md5inthash($grList),
			'gr_list' => $grList
		)
	);
}
1420
/**
 * Inserts an index_section record for the current page id.
 * getRootLineFields() is called on the record before it is written.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param	integer		phash of TYPO3 parent search result record
 * @param	integer		phash of the file indexation search record
 * @return	void
 */
function submit_section($hash,$hash_t3) {
	$sectionRow = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);
	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1440
/**
 * Removes all index records of the indexed page, $phash
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedPages($phash) {
	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$phashTables = array('index_phash','index_section','index_grlist','index_fulltext','index_debug');
	foreach ($phashTables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.intval($phash));
	}

	// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.intval($phash));
}
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469 /********************************
1470 *
1471 * SQL; External media
1472 *
1473 *******************************/
1474
1475
/**
 * Updates db with information about the file
 * Existing index_phash, index_grlist, index_fulltext and index_debug records for the phash are
 * removed first. Then index_phash and index_fulltext records are written, plus an index_debug
 * record if debug mode is configured.
 *
 * @param	array		Array with phash and phash_grouping keys for file
 * @param	string		File name
 * @param	array		Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param	string		File extension determining the type of media.
 * @param	integer		Modification time of file.
 * @param	integer		Creation time of file.
 * @param	integer		Size of file in bytes
 * @param	integer		Content HASH value.
 * @param	array		Standard content array (using only title and body for a file)
 * @return	void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

	// Find item Type: the parser's mapping for the extension, falling back to the extension itself
	$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	if (!$storeItemType) {
		$storeItemType = $ext;
	}

	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

	// Split filename (a scheme marks the file as an external URL):
	$fileParts = parse_url($file);

	// PROCESSING index_phash
	$phashRow = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => time(),
		'crdate' => time(),
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $fileParts['scheme'] ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

	// PROCESSING index_fulltext (cut to the configured length, if any)
	$fulltextRow = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts)
	);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

	// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

	// The body is cut to 1000 bytes for the debug record:
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
}
1549
/**
 * Stores file gr_list for a file IF it does not exist already
 * A record counts as existing if index_grlist has a row for the phash whose hash_gr_list matches
 * either the default gr_list ($this->defaultGrList) or the current one ($this->conf['gr_list']).
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_grlist($hash) {
	// Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
	$where = 'phash=' . intval($hash) .
		' AND (hash_gr_list=' . $this->md5inthash($this->defaultGrList) .
		' OR hash_gr_list=' . $this->md5inthash($this->conf['gr_list']) . ')';
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);

	if ($existing) {
		return;
	}

	$this->submit_grlist($hash,$hash);
}
1569
/**
 * Stores file section for a file IF it does not exist
 * The check is made for the combination of the file phash and the current page id. A new section
 * record relates the file phash to the phash of the current page ($this->hash['phash']).
 *
 * @param	integer		phash value of file
 * @return	void
 */
function submitFile_section($hash) {
	// Testing if there is a section
	$where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		return;
	}

	$this->submit_section($hash,$this->hash['phash']);
}
1583
/**
 * Removes the index records of the indexed file, $phash
 * Unlike removeOldIndexedPages() the index_section table is not touched.
 *
 * @param	integer		phash value to flush
 * @return	void
 */
function removeOldIndexedFiles($phash) {
	// Removing old registrations for tables.
	$phashTables = array('index_phash','index_grlist','index_fulltext','index_debug');
	foreach ($phashTables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.intval($phash));
	}
}
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612 /********************************
1613 *
1614 * SQL Helper functions
1615 *
1616 *******************************/
1617
/**
 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
 * Return positive integer if the page needs to be indexed
 * When the mtime matches and no max age is configured, the tstamp of the indexed record is updated as a side effect.
 *
 * @param	integer		mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param	integer		"phash" used to select any already indexed page to see what its mtime is.
 * @return	integer		Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) mtime matched so no need to reindex page. 0) N/A 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index.
 */
function checkMtimeTstamp($mtime,$phash) {

	// Select indexed page:
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	// Page has never been indexed (is not represented in the index_phash table).
	if (!$row) {
		return 4;
	}

	// The configured max-age was exceeded for the document and thus it's indexed.
	if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time()) {
		return 1;
	}

	// A minimum age is set and it was not exceeded
	if ($this->tstamp_minAge && !(($row['tstamp']+$this->tstamp_minAge)<time())) {
		return -2;
	}

	// The minimum age was exceeded (or not set), but mtime was not set, so the page is indexed.
	if (!$mtime) {
		return 3;
	}

	// mtime was set and is different from the index_phash mtime, so it's about time to re-index.
	if ($row['item_mtime'] != $mtime) {
		return 2;
	}

	// mtime matched the document, so no changes detected and no content updated
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
	} else {
		$this->updateTstamp($phash);	// Update the timestamp
		$this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
	}

	return -1;
}
1656
/**
 * Check content hash in phash table
 * Looks for an index_phash record with the phash_grouping and content hash of the current page.
 *
 * @return	mixed		Returns 1 if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
 */
function checkContentHash() {
	// With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
	$where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	return $row ? $row : 1;
}
1670
/**
 * Check content hash for external documents
 * Looks for an index_phash record with the given phash_grouping and content hash.
 *
 * @param	integer		phash value to check (phash_grouping)
 * @param	integer		Content hash to check
 * @return	integer		Returns 1 if the document needs to be indexed (that is, there was no result), otherwise 0 (zero)
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
	$where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	return $row ? 0 : 1;
}
1686
/**
 * Counts the index_grlist records stored for the given "real" phash
 * (the phash of the current content, not the linked-to phash of the common search result page).
 *
 * @param integer $phash_x Phash integer to test.
 * @return mixed The value returned by exec_SELECTcountRows() for the matching rows; evaluates to FALSE when no grlist record has been set.
 */
function is_grlist_set($phash_x) {
	$where = 'phash_x=' . intval($phash_x);

	return $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', $where);
}
1700
/**
 * Makes sure a grlist entry exists for the given phash and the current gr_list;
 * writes one (and logs it) if none is found.
 *
 * @param integer $phash phash of the search result that should be found
 * @param integer $phash_x The real phash of the current content. The two values differ when a page with user login turns out to contain exactly the same content as another, already indexed version of the page.
 * @return void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x) {
	$db = $GLOBALS['TYPO3_DB'];

	$where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
	$result = $db->exec_SELECTquery('phash', 'index_grlist', $where);

		// Entry is already there, nothing to do.
	if ($db->sql_num_rows($result)) {
		return;
	}

	$this->submit_grlist($phash,$phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
}
1716
/**
 * Sets the tstamp of an index_phash row to the current time.
 *
 * @param integer $phash phash value identifying the row
 * @param integer $mtime If set (non-zero), the item_mtime field is updated to this value as well.
 * @return void
 */
function updateTstamp($phash,$mtime=0) {
	$fields = array();
	$fields['tstamp'] = time();

	if ($mtime) {
		$fields['item_mtime'] = intval($mtime);
	}

	$where = 'phash='.intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
1732
/**
 * Writes the configured freeIndexSetId ($this->conf['freeIndexSetId']) into an index_phash record.
 *
 * @param integer $phash phash value identifying the row
 * @return void
 */
function updateSetId($phash) {
	$where = 'phash='.intval($phash);
	$fields = array();
	$fields['freeIndexSetId'] = intval($this->conf['freeIndexSetId']);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
1746
/**
 * Stores the parsetime for an index_phash row.
 *
 * @param integer $phash phash value identifying the row
 * @param integer $parsetime Parsetime value to set (cast to integer before it is written).
 * @return void
 */
function updateParsetime($phash,$parsetime) {
	$where = 'phash='.intval($phash);
	$fields = array();
	$fields['parsetime'] = intval($parsetime);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
1761
/**
 * Updates the section rootline fields in index_section for the current page ($this->conf['id']).
 *
 * @return void
 */
function updateRootline() {
		// Collect the rl* field values (filled by reference).
	$rootlineFields = array();
	$this->getRootLineFields($rootlineFields);

	$where = 'page_id='.intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootlineFields);
}
1774
/**
 * Adds the root-line field values to a field array.
 * rl0, rl1 and rl2 are always set from $this->conf['rootline_uids']; further fields can be
 * configured as "field name => rootline level" pairs in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * @param array $fieldArr Field array, passed by reference; receives the additional keys.
 * @return void
 */
function getRootLineFields(&$fieldArr) {
	$rootlineUids = $this->conf['rootline_uids'];

		// Standard fields:
	foreach (array('rl0' => 0, 'rl1' => 1, 'rl2' => 2) as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($rootlineUids[$rootLineLevel]);
	}

		// Additionally configured fields:
	$additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($additionalFields)) {
		return;
	}
	foreach ($additionalFields as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($rootlineUids[$rootLineLevel]);
	}
}
1794
/**
 * Removes indexed pages that were indexed under a gr_list other than the default one
 * but carry the same phash_grouping and contentHash as the current page.
 * NOT USED anywhere inside this class!
 *
 * @return void
 */
function removeLoginpagesWithContentHash() {
	$db = $GLOBALS['TYPO3_DB'];

	$where = '
		A.phash=B.phash
		AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
		AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
		AND A.contentHash='.intval($this->content_md5h);
	$result = $db->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

	while ($duplicate = $db->sql_fetch_assoc($result)) {
		$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$duplicate['phash']."' are now removed.",1);
		$this->removeOldIndexedPages($duplicate['phash']);
	}
}
1812
/**
 * Loads the library class of the "crawler" extension (class.tx_crawler_lib.php).
 *
 * @return void
 */
function includeCrawlerClass() {
		// The required file is evaluated in this function's scope, so the global declaration
		// makes $TYPO3_CONF_VARS visible to it (presumably for its XCLASS inclusion - confirm before removing).
	global $TYPO3_CONF_VARS;

	require_once t3lib_extMgm::extPath('crawler') . 'class.tx_crawler_lib.php';
}
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833 /********************************
1834 *
1835 * SQL; Submitting words
1836 *
1837 *******************************/
1838
/**
 * Adds new words to the index_words table.
 *
 * Looks up which of the word ids in the list already exist in index_words and inserts
 * the remaining words. Nothing is queried or written for an empty word list.
 *
 * @param array $wl Word list array: base word => array with (at least) the keys "hash" (word id) and "metaphone".
 * @return void
 */
function checkWordList($wl) {
	if (!is_array($wl) || !count($wl)) {
		return;
	}
	$db = $GLOBALS['TYPO3_DB'];

		// Collect the word ids; they are cast to integers because they go unquoted into the IN() list.
	$wordIds = array();
	foreach ($wl as $wordInfo) {
		$wordIds[] = intval($wordInfo['hash']);
	}

	$res = $db->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',', $wordIds).')');
	$knownWordCount = $db->sql_num_rows($res);

		// All words are registered already:
	if ($knownWordCount == count($wl)) {
		return;
	}

	$this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownWordCount),1);

		// Drop the words that are in the table already...
	while ($row = $db->sql_fetch_assoc($res)) {
		unset($wl[$row['baseword']]);
	}

		// ... and insert what is left.
	foreach ($wl as $baseword => $wordInfo) {
		$insertFields = array(
			'wid' => $wordInfo['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordInfo['metaphone']
		);
			// A duplicate-key error will occur here if a word was NOT unset above. However, as long as the words in $wl are not longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
		$db->exec_INSERTquery('index_words', $insertFields);
	}
}
1874
/**
 * Submits the RELATIONS between words and a phash.
 * All existing index_rel rows of the phash are deleted first, then one row per word is inserted.
 *
 * @param array $wl Word list array; each entry provides "hash", "count", "first" and "cmp".
 * @param integer $phash phash value
 * @return void
 */
function submitWords($wl,$phash) {
	$db = $GLOBALS['TYPO3_DB'];
	$db->exec_DELETEquery('index_rel', 'phash='.intval($phash));

	foreach ($wl as $wordInfo) {
			// Share of this word among all words, mapped into the stored frequency range.
		$mappedFrequency = $this->freqMap(($wordInfo['count']/$this->wordcount));

		$relation = array(
			'phash' => $phash,
			'wid' => $wordInfo['hash'],
			'count' => $wordInfo['count'],
			'first' => $wordInfo['first'],
			'freq' => $mappedFrequency,
			'flags' => ($wordInfo['cmp'] & $this->flagBitMask)
		);
		$db->exec_INSERTquery('index_rel', $relation);
	}
}
1898
/**
 * Maps a frequency onto the stored frequency scale and back.
 *
 * With factor = $this->freqMax * 100 * $this->freqRange: a value below 1 is multiplied by
 * the factor and capped at $this->freqRange; any other value is divided by the factor.
 *
 * @param double $freq Frequency
 * @return mixed Mapped frequency (number).
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax*100*$this->freqRange;

		// Values that are not below 1 take the reverse mapping.
	if (!($freq<1)) {
		return $freq/$mapFactor;
	}

	$scaled = $freq*$mapFactor;
	if ($scaled>$this->freqRange) {
		return $this->freqRange;
	}
	return $scaled;
}
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928 /********************************
1929 *
1930 * Hashing
1931 *
1932 *******************************/
1933
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" as the combination of id, type, language, mount point
 * and cHash parameters; "phash" additionally takes the gr_list into account.
 *
 * @return void
 */
function setT3Hashes() {
		// Identifying values; order and types matter because the array is serialized for hashing.
	$identity = array();
	$identity['id'] = (int)$this->conf['id'];
	$identity['type'] = (int)$this->conf['type'];
	$identity['sys_lang'] = (int)$this->conf['sys_language_uid'];
	$identity['MP'] = (string)$this->conf['MP'];
	$identity['cHash'] = $this->cHashParams;

		// Grouping hash:
	$this->hash['phash_grouping'] = $this->md5inthash(serialize($identity));

		// Plain phash: subdivision by gr_list, where a special page composition based on login is taken into account as well.
	$identity['gr_list'] = (string)$this->conf['gr_list'];
	$this->hash['phash'] = $this->md5inthash(serialize($identity));
}
1957
/**
 * Calculates the search hashes for an external file.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance, PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" (hash of the file alone) and "phash" (hash of file plus subinfo) inside.
 */
function setExtHashes($file,$subinfo=array()) {
		// Grouping hash is built from the file only:
	$identity = array('file' => $file);
	$groupingHash = $this->md5inthash(serialize($identity));

		// The plain phash includes the subinfo as well:
	$identity['subinfo'] = $subinfo;

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $this->md5inthash(serialize($identity))
	);
}
1981
/**
 * md5 integer hash
 * Only the first 7 (not 8) hex digits of the md5 hash are used, which keeps the integers
 * at 28 bit so they neither interfere with UNSIGNED integers nor with PHP versions whose
 * hexdec() output varies.
 *
 * @param string $str String to hash
 * @return integer Integer interpretation of the first 7 hex digits of the md5 hash of the input string.
 */
function md5inthash($str) {
	$digest = md5($str);
	$leadingHexDigits = substr($digest, 0, 7);

	return hexdec($leadingHexDigits);
}
1992
/**
 * Calculates the cHash value of the input GET array (for constructing cHash values if needed).
 *
 * @param array $paramArray Array of GET parameters to encode
 * @return string The value of t3lib_div::shortMD5() for the serialized cHash parameters.
 */
function makeCHash($paramArray) {
		// Turn the array into a query string and let the core pick the cHash parameters from it.
	$queryString = t3lib_div::implodeArrayForUrl('', $paramArray);
	$cHashParameters = t3lib_div::cHashParams($queryString);

	return t3lib_div::shortMD5(serialize($cHashParameters));
}
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018 /*********************************
2019 *
2020 * Internal logging functions
2021 *
2022 *********************************/
2023
/**
 * Wrapper for the push function of the TT logging object.
 * Does nothing if $GLOBALS['TT'] is not an object.
 *
 * @param string $msg Title to set
 * @param string $key Key, passed on as second argument
 * @return void
 */
function log_push($msg,$key) {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->push($msg,$key);
}
2034
/**
 * Wrapper for the pull function of the TT logging object.
 * Does nothing if $GLOBALS['TT'] is not an object.
 *
 * @return void
 */
function log_pull() {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->pull();
}
2043
/**
 * Wrapper for the setTSlogMessage function of the TT logging object.
 * The message is appended to $this->internal_log in any case, also when $GLOBALS['TT'] is not an object.
 *
 * @param string $msg Message to set
 * @param integer $errorNum Error number
 * @return void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->setTSlogMessage($msg,$errorNum);
	}

	$this->internal_log[] = $msg;
}
2055
2056
2057
2058
2059
2060
2061
2062
2063 /**************************
2064 *
2065 * tslib_fe hooks:
2066 *
2067 **************************/
2068
/**
 * Frontend hook: If the page is not being re-generated this is the chance to force it to be
 * (re-generation of the page is required in order to have the indexer called).
 * This method only loads tx_indexedsearch_tslib_fe_hook and hands the call over to its headerNoCache() method.
 *
 * @param array $params Parameters from frontend
 * @param object $ref TSFE object (reference under PHP5)
 * @return void
 * @deprecated since TYPO3 4.3 - the method was extracted to hooks/class.tx_indexedsearch_tslib_fe_hook.php
 */
function fe_headerNoCache(&$params, $ref) {
	require_once(t3lib_extMgm::extPath('indexed_search') . 'hooks/class.tx_indexedsearch_tslib_fe_hook.php');

	$hookObject = t3lib_div::makeInstance('tx_indexedsearch_tslib_fe_hook');
	$hookObject->headerNoCache($params, $ref);
}
2081 }
2082
2083
2084 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
2085 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
2086 }
2087 ?>