Feature request #5748: Introduced a new TS option that makes indexing of metatags...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 141: class tx_indexedsearch_indexer
39 * 207: function hook_indexContent(&$pObj)
40 *
41 * SECTION: Backend API
42 * 308: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
43 * 347: function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
44 * 365: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
45 *
46 * SECTION: Initialization
47 * 416: function init()
48 * 468: function initializeExternalParsers()
49 *
50 * SECTION: Indexing; TYPO3 pages (HTML content)
51 * 509: function indexTypo3PageContent()
52 * 596: function splitHTMLContent($content)
53 * 642: function getHTMLcharset($content)
54 * 657: function convertHTMLToUtf8($content,$charset='')
55 * 685: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
56 * 712: function typoSearchTags(&$body)
57 * 741: function extractLinks($content)
58 * 812: function extractHyperLinks($string)
59 *
60 * SECTION: Indexing; external URL
61 * 871: function indexExternalUrl($externalUrl)
62 * 902: function getUrlHeaders($url)
63 *
64 * SECTION: Indexing; external files (PDF, DOC, etc)
65 * 948: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
66 * 1054: function readFileContent($ext,$absFile,$cPKey)
67 * 1071: function fileContentParts($ext,$absFile)
68 * 1089: function splitRegularContent($content)
69 *
70 * SECTION: Analysing content, Extracting words
71 * 1122: function charsetEntity2utf8(&$contentArr, $charset)
72 * 1145: function processWordsInArrays($contentArr)
73 * 1170: function procesWordsInArrays($contentArr)
74 * 1180: function bodyDescription($contentArr)
75 * 1202: function indexAnalyze($content)
76 * 1223: function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
77 * 1242: function analyzeBody(&$retArr,$content)
78 * 1262: function metaphone($word,$retRaw=FALSE)
79 *
80 * SECTION: SQL; TYPO3 Pages
81 * 1304: function submitPage()
82 * 1378: function submit_grlist($hash,$phash_x)
83 * 1398: function submit_section($hash,$hash_t3)
84 * 1416: function removeOldIndexedPages($phash)
85 *
86 * SECTION: SQL; External media
87 * 1459: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
88 * 1525: function submitFile_grlist($hash)
89 * 1539: function submitFile_section($hash)
90 * 1553: function removeOldIndexedFiles($phash)
91 *
92 * SECTION: SQL Helper functions
93 * 1589: function checkMtimeTstamp($mtime,$phash)
94 * 1625: function checkContentHash()
95 * 1642: function checkExternalDocContentHash($hashGr,$content_md5h)
96 * 1656: function is_grlist_set($phash_x)
97 * 1669: function update_grlist($phash,$phash_x)
98 * 1684: function updateTstamp($phash,$mtime=0)
99 * 1699: function updateSetId($phash)
100 * 1714: function updateParsetime($phash,$parsetime)
101 * 1727: function updateRootline()
102 * 1742: function getRootLineFields(&$fieldArr)
103 * 1761: function removeLoginpagesWithContentHash()
104 * 1778: function includeCrawlerClass()
105 *
106 * SECTION: SQL; Submitting words
107 * 1805: function checkWordList($wl)
108 * 1842: function submitWords($wl,$phash)
109 * 1866: function freqMap($freq)
110 *
111 * SECTION: Hashing
112 * 1899: function setT3Hashes()
113 * 1925: function setExtHashes($file,$subinfo=array())
114 * 1949: function md5inthash($str)
115 * 1959: function makeCHash($paramArray)
116 *
117 * SECTION: Internal logging functions
118 * 1991: function log_push($msg,$key)
119 * 2000: function log_pull()
120 * 2011: function log_setTSlogMessage($msg, $errorNum=0)
121 *
122 * SECTION: tslib_fe hooks:
123 * 2036: function fe_headerNoCache(&$params, $ref)
124 *
125 * TOTAL FUNCTIONS: 59
126 * (This index is automatically created/updated by the extension "extdeveval")
127 *
128 */
129
130
131 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
132
133
134 /**
135 * Indexing class for TYPO3 frontend
136 *
137 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
138 * @package TYPO3
139 * @subpackage tx_indexedsearch
140 */
141 class tx_indexedsearch_indexer {
142
143 // Messages:
144 var $reasons = array(
145 -1 => 'mtime matched the document, so no changes detected and no content updated',
146 -2 => 'The minimum age was not exceeded',
147 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
148 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
149 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
150 4 => 'Page has never been indexed (is not represented in the index_phash table).'
151 );
152
153 // HTML code blocks to exclude from indexing:
154 var $excludeSections = 'script,style';
155
156 // Supported Extensions for external files:
157 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods.
158
159 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
160 var $defaultGrList = '0,-1';
161
162 // Min/Max times:
163 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
164 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
165 var $maxExternalFiles = 0; // Max number of external files to index.
166
167 var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc.
168 var $crawlerActive = FALSE; // Set when crawler is detected (internal)
169
170 // INTERNALS:
171 var $defaultContentArray=array(
172 'title' => '',
173 'description' => '',
174 'keywords' => '',
175 'body' => '',
176 );
177 var $wordcount = 0;
178 var $externalFileCounter = 0;
179
180 var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning)
181 var $indexerConfig = array(); // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
182 var $hash = array(); // Hash array, contains phash and phash_grouping
183 var $file_phash_arr = array(); // Hash array for files
184 var $contentParts = array(); // Content of TYPO3 page
185 var $content_md5h = '';
186 var $internal_log = array(); // Internal log
187 var $indexExternalUrl_content = '';
188
189 var $cHashParams = array(); // cHashparams array
190
191 var $freqRange = 32000;
192 var $freqMax = 0.1;
193
194 // Objects:
195 /**
196 * Charset class object
197 *
198 * @var t3lib_cs
199 */
200 var $csObj;
201
202 /**
203 * Metaphone object, if any
204 *
205 * @var user_DoubleMetaPhone
206 */
207 var $metaphoneObj;
208
209 /**
210 * Lexer object for word splitting
211 *
212 * @var tx_indexedsearch_lexer
213 */
214 var $lexerObj;
215
216
217
/**
 * Frontend hook: decides whether the page that TSFE has just rendered is to be indexed and,
 * if it is, copies the needed state from the TSFE object into $this->conf and runs the indexer.
 *
 * @param	object		Parent Object (frontend TSFE object), passed by reference
 * @return	void
 */
function hook_indexContent(&$pObj) {

		// Settings of this extension as stored (serialized) by the Extension Manager:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Crawler activation:
		// The crawler must be loaded, a crawler session must be running and re-indexing must have been requested as processing instruction.
	if (t3lib_extMgm::isLoaded('crawler') && $pObj->applicationData['tx_crawler']['running']) {
		$procInstructions = $pObj->applicationData['tx_crawler']['parameters']['procInstructions'];
		if (in_array('tx_indexedsearch_reindex', $procInstructions)) {

				// Leave a simple message in the crawler log:
			$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

			$this->crawlerActive = TRUE;	// Crawler active flag
			$this->forceIndexing = TRUE;	// Force indexing despite timestamps etc.
		}
	}

		// Nothing at all happens (not even logging) unless indexing is enabled for the page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

		// The checks run in a fixed order; the first one that fails decides the log message.
	if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {

			// Root line reduced to uids only:
		$rootlineUids = array();
		foreach($pObj->config['rootLine'] as $rlkey => $rldat) {
			$rootlineUids[$rlkey] = $rldat['uid'];
		}

			// Meta tags are indexed unless the TypoScript option is set explicitly:
		if (isset($pObj->config['config']['index_metatags'])) {
			$indexMetatags = $pObj->config['config']['index_metatags'];
		} else {
			$indexMetatags = true;
		}

			// Internal configuration, built from the TSFE object:
		$this->conf = array(
				// Information about the page for which the indexing takes place
			'id' => $pObj->id,	// Page id
			'type' => $pObj->type,	// Page type
			'sys_language_uid' => $pObj->sys_language_uid,	// sys_language UID of the language of the indexing.
			'MP' => $pObj->MP,	// MP variable, if any (Mount Points)
			'gr_list' => $pObj->gr_list,	// Group list

			'cHash' => $pObj->cHash,	// cHash string for additional parameters
			'cHash_array' => $pObj->cHash_array,	// Array of the additional parameters

			'crdate' => $pObj->page['crdate'],	// The creation date of the TYPO3 page
			'page_cache_reg1' => $pObj->page_cache_reg1,	// reg1 of the caching table.

			'rootline_uids' => $rootlineUids,

				// Content of page:
			'content' => $pObj->content,	// Content string (HTML of TYPO3 page)
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),	// Alternative title for indexing
			'metaCharset' => $pObj->metaCharset,	// Character set of content (will be converted to utf-8 during indexing)
			'mtime' => $pObj->register['SYS_LASTCHANGED'],	// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

				// Configuration of behavior:
			'index_externals' => $pObj->config['config']['index_externals'],	// Whether to index external documents like PDF, DOC etc. (if possible)
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],	// Length of description text (max 250, default 200)
			'index_metatags' => $indexMetatags,	// Whether keywords/description meta tags are indexed

				// Set to zero:
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0,
		);

			// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
300
301
302
303
304
305
306
307
308 /****************************
309 *
310 * Backend API
311 *
312 ****************************/
313
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 * Fills $this->conf with the values a frontend request would otherwise supply and calls init().
 *
 * @param	integer		The page uid, &id=
 * @param	integer		The page type, &type=
 * @param	integer		sys_language uid, typically &L=
 * @param	string		The MP variable (Mount Points), &MP=
 * @param	array		Rootline array of only UIDs.
 * @param	array		Array of GET variables to register with this indexing
 * @param	boolean		If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
 * @return	void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

		// Setting up internal configuration from config array:
	$this->conf = array();

		// Information about page for which the indexing takes place
	$this->conf['id'] = $id;	// Page id (integer)
	$this->conf['type'] = $type;	// Page type (integer)
	$this->conf['sys_language_uid'] = $sys_language_uid;	// sys_language UID of the language of the indexing (integer)
	$this->conf['MP'] = $MP;	// MP variable, if any (Mount Points) (string)
	$this->conf['gr_list'] = '0,-1';	// Group list (hardcoded for now...)

		// cHash values:
	$this->conf['cHash'] = $createCHash ? $this->makeCHash($cHash_array) : '';	// cHash string for additional parameters
	$this->conf['cHash_array'] = $cHash_array;	// Array of the additional parameters

		// Set to defaults
	$this->conf['freeIndexUid'] = 0;
	$this->conf['freeIndexSetId'] = 0;
	$this->conf['page_cache_reg1'] = '';

		// Root line uids
	$this->conf['rootline_uids'] = $uidRL;

		// Configuration of behavior:
	$this->conf['index_externals'] = 1;	// Whether to index external documents like PDF, DOC etc. (if possible)
	$this->conf['index_descrLgd'] = 200;	// Length of description text (max 250, default 200)
		// Whether to index keywords/description meta tags. splitHTMLContent() only reads them when this
		// key is set, and backend_indexAsTYPO3Page() passes keywords and description as meta tags, so it
		// must be enabled here. TRUE is also the default used by hook_indexContent().
	$this->conf['index_metatags'] = TRUE;

		// Init and start indexing:
	$this->init();
}
357
/**
 * Registers the free-index uid and the indexing "set" id in the internal configuration.
 * Can be called right after backend_initIndexer()
 *
 * @param	integer		Free index UID
 * @param	integer		Set id - an integer identifying the "set" of indexing operations.
 * @return	void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId,
	);

		// Written one by one so all other entries of $this->conf stay untouched:
	foreach ($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
369
/**
 * Indexes record content as if it was the content of a TYPO3 page.
 * The values are escaped and wrapped into a minimal HTML document which is then handed to
 * indexTypo3PageContent().
 *
 * @param	string		Title equivalent
 * @param	string		Keywords equivalent
 * @param	string		Description equivalent
 * @param	string		The main content to index
 * @param	string		The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param	integer		Last modification time, in seconds
 * @param	integer		The creation date of the content, in seconds
 * @param	integer		The record UID that the content comes from (for registration with the indexed rows)
 * @return	void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

		// Timestamps and origin of the content:
	$this->conf['mtime'] = $mtime;	// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;	// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

		// All values are inserted as text, so they are escaped first:
	$titleHtml = htmlspecialchars($title);
	$keywordsHtml = htmlspecialchars($keywords);
	$descriptionHtml = htmlspecialchars($description);
	$bodyHtml = htmlspecialchars($content);

		// Fake HTML document for the page parser (content string as for a TYPO3 page):
	$this->conf['content'] = '
<html>
<head>
<title>'.$titleHtml.'</title>
<meta name="keywords" content="'.$keywordsHtml.'" />
<meta name="description" content="'.$descriptionHtml.'" />
</head>
<body>
'.$bodyHtml.'
</body>
</html>';

		// Charset of the content (will be converted to utf-8 during indexing); no alternative title:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

		// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}
410
411
412
413
414
415
416
417
418
419
420
421
422
423 /********************************
424 *
425 * Initialization
426 *
427 *******************************/
428
/**
 * Prepares the indexer for work: cHash parameters, page hashes, Extension Manager settings,
 * external parsers, lexer, optional metaphone object and the charset object.
 * $this->conf MUST be set with proper values prior to this call!!!
 *
 * @return	void
 */
function init() {

		// cHash parameters:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
			// Add the cHash itself so that URL's come out right...
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
			// encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
		unset($this->cHashParams['encryptionKey']);
	}

		// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
	$this->setT3Hashes();

		// Indexer configuration from Extension Manager interface:
		// NOTE(review): flagBitMask is not among the properties declared at the top of the class as far as visible here - confirm.
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

		// External document parsers are only needed when external documents are indexed.
		// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Lexer (class that deconstructs the text into words); the bundled lexer is used unless another one is configured.
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer']) {
		$lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'];
	}
	$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

		// Metaphone hook, only if configured:
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = &$this;
	}

		// Charset class:
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
478
/**
 * Initialize external parsers
 * Creates one parser object per file extension configured in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] and keeps those whose
 * initParser() call succeeds in $this->external_parsers.
 *
 * @return	void
 * @access private
 * @see init()
 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
			$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

				// A reference that does not resolve to an object is dropped here; otherwise the
				// initParser() call below would be made on a non-object.
			if (!is_object($this->external_parsers[$extension])) {
				unset($this->external_parsers[$extension]);
				continue;
			}

				// Give the parser access to this indexer:
			$this->external_parsers[$extension]->pObj = &$this;

				// Init parser and if it returns false, unset its entry again:
			if (!$this->external_parsers[$extension]->initParser($extension)) {
				unset($this->external_parsers[$extension]);
			}
		}
	}
}
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516 /********************************
517 *
518 * Indexing; TYPO3 pages (HTML content)
519 *
520 *******************************/
521
/**
 * Start indexing of the TYPO3 page
 * Uses $this->conf (content, mtime, charset, ...) and $this->hash as set up by init().
 * Depending on timestamps, gr_list state and content hash the page is either fully indexed,
 * only has its bookkeeping (timestamp, set id, gr_list, rootline) refreshed, or is left alone.
 *
 * @return	void
 */
function indexTypo3PageContent() {

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

		// Neither the timestamps nor the gr_list nor a forced run ask for indexing:
	if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

		// Setting message:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

		// Divide into title,keywords,description and body:
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

		// Calculating a hash over what is to be the actual page content (title, description and keywords included, so a changed page title also counts as a change).
		// implode() is called with the separator first: the reversed argument order is no longer accepted by PHP 8.
	$this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

		// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
		// If the contentHash is the same, the page is already indexed and regardless of mtime we don't need to index it again.
		// This also prevents re-indexing when a logged in fe_user sees the very same content as the page already indexed without login.
	$checkCHash = $this->checkContentHash();

		// $check===1 means the configured max-age was exceeded (see $this->reasons); then the page is indexed even if the content hash matched.
	if (!is_array($checkCHash) || $check===1) {
		$Pstart=t3lib_div::milliseconds();

		$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
		$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
		$this->log_pull();

			// Splitting words
		$this->log_push('Extract words from content','');
		$splitInWords = $this->processWordsInArrays($this->contentParts);
		$this->log_pull();

			// Analyse the indexed words.
		$this->log_push('Analyse the extracted words','');
		$indexArr = $this->indexAnalyze($splitInWords);
		$this->log_pull();

			// Submitting page (phash) record
		$this->log_push('Submitting page','');
		$this->submitPage();
		$this->log_pull();

			// Check words and submit to word list if not there
		$this->log_push('Check word list and submit words','');
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr,$this->hash['phash']);
		$this->log_pull();

			// Set parsetime
		$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

			// Checking external files if configured for.
		$this->log_push('Checking external files','');
		if ($this->conf['index_externals']) {
			$this->extractLinks($this->conf['content']);
		}
		$this->log_pull();
	} else {
			// Same content is already in the index: only refresh the bookkeeping.
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
	}
}
606
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and the text of the body.
 * Keywords and description are only collected when $this->conf['index_metatags'] is set.
 *
 * @param	string		HTML content to index. To some degree expected to be made by TYPO3 (ie. splitting the header by ":")
 * @return	array		Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
function splitHTMLContent($content) {

		// Divide head from body: the body starts at the first "<body", the head is everything before it.
		// NOTE(review): without a "<body" tag stristr() yields FALSE and the negative length becomes 0, so
		// the head part is empty as well and nothing is extracted. Kept as is - confirm this is intended.
	$contentArr = $this->defaultContentArray;
	$contentArr['body'] = stristr($content,'<body');
	$headPart = substr($content,0,-strlen($contentArr['body']));

		// Get title; if it contains a colon, only the part after the first colon is used:
	$unusedAfter = '';
	$unusedParams = '';
	$this->embracingTags($headPart,'TITLE',$contentArr['title'],$unusedAfter,$unusedParams);
	$titleParts = explode(':',$contentArr['title'],2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

		// Get keywords and description metatags
	if (!empty($this->conf['index_metatags'])) {

			// Collect the attribute strings of all meta tags, consuming the head part tag by tag:
		$metaTags = array();
		$metaParams = '';
		$unusedContent = '';
		$restOfHead = '';
		while ($this->embracingTags($headPart,'meta',$unusedContent,$restOfHead,$metaParams)) {
			$metaTags[] = $metaParams;
			$headPart = $restOfHead;
		}

			// Meta tags without "name" or "content" attribute (e.g. http-equiv tags) are simply passed over:
		foreach ($metaTags as $metaParams) {
			$attributes = t3lib_div::get_tag_attributes($metaParams);
			$metaName = isset($attributes['name']) ? $attributes['name'] : '';
			$metaValue = isset($attributes['content']) ? $attributes['content'] : '';

			if (stristr($metaName,'keywords')) {
				$contentArr['keywords'].= ','.$metaValue;
			}
			if (stristr($metaName,'description')) {
				$contentArr['description'].= ','.$metaValue;
			}
		}
	}

		// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

		// Get rid of unwanted sections (ie. scripting and style stuff) in body
	$tagList = explode(',',$this->excludeSections);
	foreach($tagList as $tag) {
		$bodyWithoutSection = '';
		while ($this->embracingTags($contentArr['body'],$tag,$unusedContent,$bodyWithoutSection,$unusedParams)) {
			$contentArr['body'] = $bodyWithoutSection;
		}
	}

		// remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<',' <',$contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

		// Return array
	return $contentArr;
}
655
/**
 * Extract the charset value from the HTML meta tag that has http-equiv="Content-Type".
 * Matching is case-insensitive. PCRE is used because the ereg functions are not available
 * in current PHP versions.
 *
 * @param	string		HTML content
 * @return	string		The charset value if found, otherwise NULL.
 */
function getHTMLcharset($content) {
		// First locate the complete Content-Type meta tag ...
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
			// ... then pick the charset name (letters, digits and hyphens) from inside that tag:
		if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) {
			return $reg2[1];
		}
	}

	return NULL;
}
669
/**
 * Converts a HTML document to utf-8: first the character set, then the HTML entities.
 *
 * @param	string		HTML content, any charset
 * @param	string		Optional charset (otherwise extracted from HTML)
 * @return	string		Converted HTML
 */
function convertHTMLToUtf8($content,$charset='') {

		// Without an explicit charset the one declared in the document is used:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

		// Re-encode only if a charset is known and it is not utf-8 already:
	$needsEncoding = $charset && $charset!=='utf-8';
	if ($needsEncoding) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

		// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
692
/**
 * Finds first occurence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The tag name is matched case-insensitively and as a prefix ("<"+name). If the tag has no end tag,
 * the content is blank and everything after the start tag is returned as "string after". A start tag
 * that is never closed with ">" yields an empty "string after". The by-reference variables are only
 * written when the tag was found.
 *
 * @param	string		String to search in
 * @param	string		Tag name, eg. "script"
 * @param	string		Passed by reference: Content inside found tag
 * @param	string		Passed by reference: Content after found tag
 * @param	string		Passed by reference: Attributes of the found tag.
 * @return	boolean		Returns false if tag was not found, otherwise true.
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$string = (string)$string;
	$endTag = '</'.$tagName.'>';
	$startTag = '<'.$tagName;

		// Case-insensitive search for the start tag:
	$startPos = stripos($string,$startTag);
	if ($startPos === FALSE) {
		return FALSE;
	}

		// Split what follows "<tagName" into the attribute list (up to the first ">") and the remainder.
		// The remainder is empty if the start tag is never closed.
	$parts = explode('>',(string)substr($string,$startPos+strlen($startTag)),2);
	$paramList = $parts[0];
	$remainder = isset($parts[1]) ? $parts[1] : '';

	$endPos = stripos($remainder,$endTag);
	if ($endPos !== FALSE) {
			// Tag with end tag: return its content and the string with the whole element cut out.
		$tagContent = (string)substr($remainder,0,$endPos);
		$stringAfter = substr($string,0,$startPos).substr($remainder,$endPos+strlen($endTag));
	} else {
			// If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
		$tagContent = '';
		$stringAfter = $remainder;
	}

	return TRUE;
}
725
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * Content after a <!--TYPO3SEARCH_begin--> marker is kept up to the next marker. Content in front of
 * a <!--TYPO3SEARCH_end--> marker that was not started by a "begin" marker (i.e. the text before the
 * first marker) is kept as well; it is added only once, also if several "end" markers follow.
 * If no marker is found, the content is left unchanged.
 *
 * @param	string		HTML Content, passed by reference
 * @return	boolean		Returns true if a TYPOSEARCH_ tag was found, otherwise false.
 */
function typoSearchTags(&$body) {
	$chunks = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',(string)$body);

		// No marker at all (or the split failed): leave the content alone.
	if (!is_array($chunks) || count($chunks) <= 1) {
		return FALSE;
	}

	$body = '';
	$pending = '';	// Text not preceded by a marker, waiting for an "end" marker

	foreach ($chunks as $chunk) {
			// Each chunk after the first one starts with the marker name, followed by "-->":
		$part = explode('-->',$chunk,2);
		$marker = trim($part[0]);

		if ($marker === 'begin') {
			$body.= isset($part[1]) ? $part[1] : '';
			$pending = '';
		} elseif ($marker === 'end') {
			$body.= $pending;
				// Cleared so that a further "end" marker cannot add the same text again:
			$pending = '';
		} else {
			$pending = $chunk;
		}
	}

	return TRUE;
}
754
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 * External URLs are indexed if "indexExternalURLs" is configured. Local files are either indexed
 * right away or, if "useCrawlerForExternalFiles" is configured and the crawler is loaded, added to
 * the crawler queue.
 *
 * @param	string		HTML content
 * @return	void
 */
function extractLinks($content) {

		// Get links:
	$list = $this->extractHyperLinks($content);

		// Crawler object, only if files are to be queued instead of indexed directly:
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

		// Traverse links:
	foreach($list as $linkInfo) {

			// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
		$viaDownloadScript = !empty($linkInfo['localPath']);

			// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($viaDownloadScript ? $linkInfo['localPath'] : $linkInfo['href']);

			// Parse URL; parse_url() returns FALSE for seriously malformed URLs, which is treated like "no parts":
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}

			// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);
			$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);	// parse again due to new linkSource!
			if (!is_array($qParts)) {
				$qParts = array();
			}
		}

		if (!empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
					// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

			// Local links with a query string are not files:
		if (!empty($qParts['query'])) {
			continue;
		}

		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

			// Index local file:
		if (is_object($crawler)) {
				// Queue the file; "alturl" is only passed for files delivered by a download script.
			$params = array('document' => $linkSource);
			if ($viaDownloadScript) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);	// The page content is not needed in the queue entry

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($viaDownloadScript) {
				// Indexed under the href, read from the local path, with the file extension of the local path:
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
839
840 /**
841 * Extracts all links to external documents from content string.
842 *
843 * @param string Content to analyse
844 * @return array Array of hyperlinks
845 * @see extractLinks()
846 */
function extractHyperLinks($string) {
	// Create the HTML parser on first use and keep it on the object.
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$parts = $this->htmlParser->splitTags('a',$string);
	$list = array();

	foreach ($parts as $k => $v) {
		// Only odd-indexed parts are inspected (presumably those are the matched tags - verify against splitTags()).
		if (!($k%2)) {
			continue;
		}

		$params = $this->htmlParser->get_tag_attributes($v,1);
		$firstTagName = $this->htmlParser->getFirstTagName($v);	// The 'name' of the first tag

		if (strtolower($firstTagName) !== 'a') {
			continue;
		}

		// Anchors without href (e.g. named anchors) carry no link and are skipped
		// without touching an undefined array key.
		$src = isset($params[0]['href']) ? $params[0]['href'] : '';
		if (!$src) {
			continue;
		}

		// Check if a local path to that file has been set - useful if you are using a download script.
		// Result: the registered path, '' if the registry exists but has no entry, false if there is no registry.
		$md5 = t3lib_div::shortMD5($src);
		$indexLocalFiles = isset($GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])
			? $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']
			: NULL;
		if (is_array($indexLocalFiles)) {
			$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
		} else {
			$localPath = false;
		}

		$list[] = array(
			'tag' => $v,
			'href' => $src,
			'localPath' => $localPath
		);
	}

	return $list;
}
882
883
884
885
886
887
888
889
890
891
892
893 /******************************************
894 *
895 * Indexing; external URL
896 *
897 ******************************************/
898
899 /**
900 * Index External URLs HTML content
901 *
902 * @param string URL, eg. "http://typo3.org/"
903 * @return void
904 * @see indexRegularDocument()
905 */
function indexExternalUrl($externalUrl) {

	// Get headers; anything but an array means the URL gave no usable answer.
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
		return;
	}

	// HTTP header names are case-insensitive, so look the content type up that way.
	// As with the plain array lookup, the last matching header wins.
	$contentType = '';
	foreach ($urlHeaders as $headerName => $headerValue) {
		if (strtolower(trim($headerName)) === 'content-type') {
			$contentType = $headerValue;
		}
	}

	// Only HTML documents are indexed:
	if (!stristr($contentType,'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

	// Create temporary file. The content is written to "<base>.html"; the base name itself
	// comes from tempnam(), which is expected to create a file of its own (NOTE(review): confirm
	// for t3lib_div::tempnam()), so both have to be cleaned up afterwards.
	$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
	$tmpFile = $tmpBase.'.html';
	t3lib_div::writeFile($tmpFile, $content);

	// Index that file. Using "TRUE" for second parameter to force indexing of external URLs
	// (mtime doesn't make sense, does it?)
	$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');

	// Clean up: the content file and, if present, the base file left behind by tempnam().
	if (@is_file($tmpFile)) {
		unlink($tmpFile);
	}
	if ($tmpBase && @is_file($tmpBase)) {
		unlink($tmpBase);
	}
}
929
930 /**
931 * Getting HTTP request headers of URL
932 *
933 * @param string The URL
934 * @param integer Timeout (seconds?)
935 * @return mixed If no answer, returns false. Otherwise an array where HTTP headers are keys
936 */
function getUrlHeaders($url) {
	// Note: this function takes the URL only; there is no timeout parameter.
	$content = t3lib_div::getURL($url,2);	// Try to get the headers only

	// No answer: return an explicit false instead of falling off the end of the function.
	if (!strlen($content)) {
		return FALSE;
	}

	// Compile headers:
	$retVal = array();
	foreach (t3lib_div::trimExplode(chr(10),$content,1) as $line) {
		if (!strlen(trim($line))) {
			break;	// Stop at the first empty line (= end of header)
		}

		// Split into name and value at the first colon. Lines without a colon (such as the
		// status line) are still registered under their full text, with a NULL value.
		// The value is stored as received, i.e. it is not trimmed here.
		$pair = explode(':', $line, 2);
		$retVal[$pair[0]] = isset($pair[1]) ? $pair[1] : NULL;
	}

	return $retVal;
}
955
956
957
958
959
960
961
962
963
964
965
966
967
968 /******************************************
969 *
970 * Indexing; external files (PDF, DOC, etc)
971 *
972 ******************************************/
973
974 /**
975 * Indexing a regular document given as $file (relative to PATH_site, local file)
976 *
977 * @param string Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
978 * @param boolean If set, indexing is forced (despite content hashes, mtime etc).
979 * @param string Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
980 * @param string File extension for temporary file.
981 * @return void
982 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

	// Init: an explicitly given extension wins over the one found in $file.
	// $file may have no extension at all (for instance a URL ending in "/").
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

	// Create abs-path:
	if ($contentTmpFile) {
		// Content is read from the temporary file; $file is then only used as identifier.
		$absFile = $contentTmpFile;
	} else {
		if (!t3lib_div::isAbsPath($file)) {	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		// The resulting path must pass t3lib_div::isAllowedAbsPath(), otherwise it is discarded.
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	}

	// Nothing can be indexed without a readable file and a parser for its extension:
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

	// Indexing the document, one pass per content part:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

			// Check external file counter (forced indexing ignores the limit):
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

				// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
					// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
					// Separator first, array second: the reversed argument order is no longer accepted by PHP 8.
					$content_md5h = $this->md5inthash(implode('',$contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

						// Increment counter:
						$this->externalFileCounter++;

						// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

						// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

						// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

						// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

						// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

		// Checking and setting sections:
		#	$this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
		$this->submitFile_section($phash_arr['phash']);	// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->log_pull();
	}
}
1079
1080 /**
1081 * Reads the content of an external file being indexed.
1082 * The content from the external parser MUST be returned in utf-8!
1083 *
1084 * @param string File extension, eg. "pdf", "doc" etc.
1085 * @param string Absolute filename of file (must exist and be validated OK before calling function)
1086 * @param string Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
1087 * @return array Standard content array (title, description, keywords, body keys)
1088 */
function readFileContent($ext,$absFile,$cPKey) {
	// Stays NULL when no parser object is registered for the extension, so the
	// function never returns an undefined variable.
	$contentArr = NULL;

	// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
1098
1099 /**
1100 * Creates an array with pointers to divisions of document.
1101 *
1102 * @param string File extension
1103 * @param string Absolute filename (must exist and be validated OK before calling function)
1104 * @return array Array of pointers to sections that the document should be divided into
1105 */
function fileContentParts($ext,$absFile) {
	// Without a parser object for this extension the document is a single part, pointer "0".
	if (!is_object($this->external_parsers[$ext])) {
		return array(0);
	}

	// Otherwise the external document parser decides how the document is divided.
	return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
}
1116
1117 /**
1118 * Splits non-HTML content (from external files for instance)
1119 *
1120 * @param string Input content (non-HTML) to index.
1121 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
1122 * @see splitHTMLContent()
1123 */
function splitRegularContent($content) {
	// Start from a copy of the default content array and put the whole input into "body".
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;
	return $parts;
}
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144 /**********************************
1145 *
1146 * Analysing content, Extracting words
1147 *
1148 **********************************/
1149
1150 /**
1151 * Convert character set and HTML entities in the value of input content array keys
1152 *
1153 * @param array Standard content array
1154 * @param string Charset of the input content (converted to utf-8)
1155 * @return void
1156 */
function charsetEntity2utf8(&$contentArr, $charset) {

	// Walk over the keys and write back into the referenced array.
	// (foreach replaces the former each() loop; each() no longer exists in PHP 8.)
	foreach (array_keys($contentArr) as $key) {
		// Empty values need no conversion:
		if (!strlen($contentArr[$key])) {
			continue;
		}

		// Convert charset if necessary
		if ($charset!=='utf-8') {
			$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
		}

		// decode all numeric / html-entities in the string to real characters:
		$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
	}
}
1173
1174 /**
1175 * Processing words in the array from split*Content -functions
1176 *
1177 * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
1178 * @return array Content input array modified so each key is now an array of words (without duplicates for title, keywords and description)
1179 */
function processWordsInArrays($contentArr) {

	// split all parts to words
	// (foreach replaces the former each() loop; each() no longer exists in PHP 8.)
	foreach (array_keys($contentArr) as $key) {
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

	// For title, keywords, and description we don't want duplicates.
	// Keys that are missing or did not yield a word array are left as they are.
	foreach (array('title','keywords','description') as $key) {
		if (isset($contentArr[$key]) && is_array($contentArr[$key])) {
			$contentArr[$key] = array_unique($contentArr[$key]);
		}
	}

	// Return modified array:
	return $contentArr;
}
1196
1197 /**
1198 * Processing words in the array from split*Content -functions
1199 * This function is only a wrapper kept for backwards compatibility, because the function has been renamed to processWordsInArrays() (see above).
1200 *
1201 * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
1202 * @return array Content input array modified so each key is now an array of words (without duplicates for title, keywords and description)
1203 * @deprecated
1204 */
function procesWordsInArrays($contentArr) {
	// Old (misspelled) name: simply forwards to processWordsInArrays().
	$processed = $this->processWordsInArrays($contentArr);
	return $processed;
}
1208
1209 /**
1210 * Extracts the sample description text from the content array.
1211 *
1212 * @param array Content array
1213 * @return string Description string
1214 */
function bodyDescription($contentArr) {
	// Default result, returned when the configured length is zero
	// (previously an undefined variable was returned in that case).
	$bodyDescription = '';

	// Setting description: maximum length is taken from the "index_descrLgd" configuration value.
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
		// Replace tabs and line breaks by plain spaces:
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1230
1231 /**
1232 * Analyzes content to use for indexing,
1233 *
1234 * @param array Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
1235 * @return array Index Array (whatever that is...)
1236 */
function indexAnalyze($content) {
	$indexArr = array();

	// Header fields are registered with their bit offset (title 7, keywords 6, description 5),
	// then the body words are added to the same array.
	$this->analyzeHeaderinfo($indexArr,$content,'title',7);
	$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
	$this->analyzeHeaderinfo($indexArr,$content,'description',5);
	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}
1248
1249 /**
1250 * Calculates relevant information for headercontent
1251 *
1252 * @param array Index array, passed by reference
1253 * @param array Standard content array
1254 * @param string Key from standard content array
1255 * @param integer Bit-wise priority to type
1256 * @return void
1257 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
	// Nothing to do when the content array has no word list under this key.
	if (!isset($content[$key]) || !is_array($content[$key])) {
		return;
	}

	// Bit that marks "word occurs in this header field":
	$typeBit = pow(2,$offset);

	// (foreach replaces the former each() loop; each() no longer exists in PHP 8.)
	foreach ($content[$key] as $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		// "cmp" and "count" accumulate over calls; start from 0 for a word seen the first time.
		$retArr[$val]['cmp'] = (isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0) | $typeBit;
		$retArr[$val]['count'] = (isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0) + 1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
1269
1270 /**
1271 * Calculates relevant information for bodycontent
1272 *
1273 * @param array Index array, passed by reference
1274 * @param array Standard content array
1275 * @return void
1276 */
function analyzeBody(&$retArr,$content) {
	foreach ($content['body'] as $key => $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		// First time this word is seen at all: remember its position in the body and its hashes.
		// Words already registered (for instance from the header fields) keep their entry.
		if (!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

		// Count the occurrence; start from 0 when no counter exists yet.
		$retArr[$val]['count'] = (isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0) + 1;
		$this->wordcount++;
	}
}
1289
1290 /**
1291 * Creating metaphone based hash from input word
1292 *
1293 * @param string Word to convert
1294 * @param boolean If set, returns the raw metaphone value (not hashed)
1295 * @return mixed Metaphone hash integer (or raw value, string)
1296 */
function metaphone($word,$retRaw=FALSE) {

	// Use the configured metaphone object if there is one, otherwise PHP's own metaphone().
	$raw = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

	// Return raw value?
	if ($retRaw) {
		return $raw;
	}

	// Otherwise create hash and return integer (0 for an empty metaphone value)
	return ($raw == '') ? 0 : hexdec(substr(md5($raw),0,7));
}
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328 /********************************
1329 *
1330 * SQL; TYPO3 Pages
1331 *
1332 *******************************/
1333
1334 /**
1335 * Updates db with information about the page (TYPO3 page, not external media)
1336 *
1337 * @return void
1338 */
function submitPage() {
	$phash = $this->hash['phash'];
	$db = $GLOBALS['TYPO3_DB'];

	// Remove any current data for this phash:
	$this->removeOldIndexedPages($phash);

	// index_phash: the main record of the indexed page
	$phashRow = array(
		'phash' => $phash,
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$db->exec_INSERTquery('index_phash', $phashRow);

	// index_section and index_grlist: for a TYPO3 page both hash arguments are the page's own phash
	$this->submit_section($phash,$phash);
	$this->submit_grlist($phash,$phash);

	// index_fulltext: all content parts joined by spaces, cut to the configured length if one is set
	$fullText = implode(' ', $this->contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$db->exec_INSERTquery('index_fulltext', array(
		'phash' => $phash,
		'fulltextdata' => $fullText
	));

	// index_debug: only written in debug mode; content and body are cut to 1000 bytes
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
		'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$db->exec_INSERTquery('index_debug', array(
		'phash' => $phash,
		'debuginfo' => serialize($debugInfo)
	));
}
1404
1405 /**
1406 * Stores gr_list in the database.
1407 *
1408 * @param integer Search result record phash
1409 * @param integer Actual phash of current content
1410 * @return void
1411 * @see update_grlist()
1412 */
function submit_grlist($hash,$phash_x) {
	$grList = $this->conf['gr_list'];

	// Insert the gr_list record: the list itself plus its integer hash.
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($grList),
		'gr_list' => $grList
	));
}
1424
1425 /**
1426 * Stores section
1427 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
1428 *
1429 * @param integer phash of TYPO3 parent search result record
1430 * @param integer phash of the file indexation search record
1431 * @return void
1432 */
function submit_section($hash,$hash_t3) {
	$sectionRow = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

	// getRootLineFields() receives the row and completes it before it is inserted.
	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1444
1445 /**
1446 * Removes records for the indexed page, $phash
1447 *
1448 * @param integer phash value to flush
1449 * @return void
1450 */
function removeOldIndexedPages($phash) {
	$phashInt = intval($phash);

	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = explode(',','index_phash,index_section,index_grlist,index_fulltext,index_debug');
	foreach ($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.$phashInt);
	}

	// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phashInt);
}
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473 /********************************
1474 *
1475 * SQL; External media
1476 *
1477 *******************************/
1478
1479
1480 /**
1481 * Updates db with information about the file
1482 *
1483 * @param array Array with phash and phash_grouping keys for file
1484 * @param string File name
1485 * @param array Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
1486 * @param string File extension determining the type of media.
1487 * @param integer Modification time of file.
1488 * @param integer Creation time of file.
1489 * @param integer Size of file in bytes
1490 * @param integer Content HASH value.
1491 * @param array Standard content array (using only title and body for a file)
1492 * @return void
1493 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {
	$phash = $hash['phash'];
	$db = $GLOBALS['TYPO3_DB'];

	// Find item type: the parser's mapping for this extension, or the extension itself as fallback
	$mappedType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	$storeItemType = $mappedType ? $mappedType : $ext;

	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($phash);

	// Split filename; a scheme in it marks the record as external URL
	$fileParts = parse_url($file);

	// index_phash: the main record of the indexed file. The file's base name is the title fallback.
	$phashRow = array(
		'phash' => $phash,
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => time(),
		'crdate' => time(),
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $fileParts['scheme'] ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$db->exec_INSERTquery('index_phash', $phashRow);

	// index_fulltext: all content parts joined by spaces, cut to the configured length if one is set
	$fullText = implode(' ', $contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$db->exec_INSERTquery('index_fulltext', array(
		'phash' => $phash,
		'fulltextdata' => $fullText
	));

	// index_debug: only written in debug mode; the body is cut to 1000 bytes
	if (!$this->indexerConfig['debugMode']) {
		return;
	}
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$db->exec_INSERTquery('index_debug', array(
		'phash' => $phash,
		'debuginfo' => serialize($debugInfo)
	));
}
1553
1554 /**
1555 * Stores file gr_list for a file IF it does not exist already
1556 *
1557 * @param integer phash value of file
1558 * @return void
1559 */
function submitFile_grlist($hash) {
	// Testing if there is a gr_list record for a non-logged in user (default gr_list) or for the
	// current gr_list; if so, there is no need to place another one.
	$where = 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')';
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

	$alreadyThere = $GLOBALS['TYPO3_DB']->sql_num_rows($res);
	if (!$alreadyThere) {
		$this->submit_grlist($hash,$hash);
	}
}
1567
1568 /**
1569 * Stores file section for a file IF it does not exist
1570 *
1571 * @param integer phash value of file
1572 * @return void
1573 */
function submitFile_section($hash) {
	// Testing if there is a section record for this file on the current page
	$where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

	// None found: store one, related to the phash of the page currently being indexed.
	$alreadyThere = $GLOBALS['TYPO3_DB']->sql_num_rows($res);
	if (!$alreadyThere) {
		$this->submit_section($hash,$this->hash['phash']);
	}
}
1581
1582 /**
1583 * Removes records for the indexed page, $phash
1584 *
1585 * @param integer phash value to flush
1586 * @return void
1587 */
function removeOldIndexedFiles($phash) {
	$phashInt = intval($phash);

	// Removing old registrations for tables. index_section is not part of this list.
	$tables = explode(',','index_phash,index_grlist,index_fulltext,index_debug');
	foreach ($tables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.$phashInt);
	}
}
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610 /********************************
1611 *
1612 * SQL Helper functions
1613 *
1614 *******************************/
1615
1616 /**
1617 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
1618 * Return positive integer if the page needs to be indexed
1619 *
1620 * @param integer mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
1621 * @param integer "phash" used to select any already indexed page to see what its mtime is.
1622 * @return integer Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) mtime matched so no need to reindex page. 0) N/A 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index.
1623 */
function checkMtimeTstamp($mtime,$phash) {

	// Select indexed page:
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	// 4: Page has never been indexed (is not represented in the index_phash table).
	if (!$row) {
		return 4;
	}

	// 1: The configured max-age was exceeded for the document and thus it's indexed.
	if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time()) {
		return 1;
	}

	// -2: A minimum age is set and has not been exceeded yet.
	if ($this->tstamp_minAge && ($row['tstamp']+$this->tstamp_minAge) >= time()) {
		return -2;
	}

	// 3: The minimum age was exceeded (or not set), but mtime was not set, so the page is indexed.
	if (!$mtime) {
		return 3;
	}

	// 2: mtime is different from the index_phash mtime, it's about time to re-index.
	if ($row['item_mtime'] != $mtime) {
		return 2;
	}

	// -1: mtime matched the document, so no changes detected and no content updated.
	// The timestamp is only refreshed when no max-age is configured.
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
	}
	return -1;
}
1654
1655 /**
1656 * Check content hash in phash table
1657 *
1658 * @return mixed Returns true if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
1659 */
function checkContentHash() {
	// With this query the page will only be indexed if its content is different from the same "phash_grouping" -page.
	$where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

	// A hit is returned as the row array (holding "phash"); no hit yields 1, meaning "index the page".
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
	if ($row) {
		return $row;
	}
	return 1;
}
1668
/**
 * Check content hash for external documents
 * Tests whether index_phash holds a record with the given grouping hash and content hash.
 *
 * @param	integer		phash value to check (phash_grouping)
 * @param	integer		Content hash to check
 * @return	integer		1 if the document needs to be indexed (that is, there was no result), otherwise 0
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
	$where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);

	// Only the existence of a row matters, so a single column is selected instead of "*".
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

	return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
}
1684
/**
 * Counts the index_grlist records whose "phash_x" field equals the input value (this is the "real" phash of the current content, not the linked-to phash of the common search result page).
 *
 * @param	integer		Phash integer to test.
 * @return	integer		Number of matching rows as reported by sql_num_rows(); zero means no grlist record has been set.
 */
function is_grlist_set($phash_x) {
	$where = 'phash_x='.intval($phash_x);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);

	$rowCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);
	return $rowCount;
}
1695
/**
 * Makes sure index_grlist has an entry for the phash value combined with the hash of the current gr_list; one is written (and a log message set) only when none is found.
 *
 * @param	integer		phash of the search result that should be found
 * @param	integer		The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return	void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x) {
	$where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

	// An entry exists already - nothing to do.
	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
		return;
	}

	$this->submit_grlist($phash,$phash_x);

	$logMessage = "Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'";
	$this->log_setTSlogMessage($logMessage,1);
}
1711
/**
 * Sets the "tstamp" field of an index_phash row to the current time and, optionally, its "item_mtime" field as well.
 *
 * @param	integer		phash value
 * @param	integer		If set (non-zero), the item_mtime field is updated to this value.
 * @return	void
 */
function updateTstamp($phash,$mtime=0) {
	$updateFields = array();
	$updateFields['tstamp'] = time();

	// The mtime is only touched when a value was passed.
	if ($mtime) {
		$updateFields['item_mtime'] = intval($mtime);
	}

	$where = 'phash='.intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $updateFields);
}
1727
/**
 * Writes the configured "freeIndexSetId" (from $this->conf) into the index_phash record with the given phash.
 *
 * @param	integer		phash value
 * @return	void
 */
function updateSetId($phash) {
	$setId = intval($this->conf['freeIndexSetId']);
	$where = 'phash='.intval($phash);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array('freeIndexSetId' => $setId));
}
1741
/**
 * Stores a parsetime value in the "parsetime" field of the index_phash record with the given phash.
 *
 * @param	integer		phash value.
 * @param	integer		Parsetime value to set.
 * @return	void
 */
function updateParsetime($phash,$parsetime) {
	$newValues = array();
	$newValues['parsetime'] = intval($parsetime);

	$where = 'phash='.intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $newValues);
}
1756
/**
 * Updates the root-line fields of the index_section records belonging to the current page ($this->conf['id']).
 *
 * @return	void
 */
function updateRootline() {
	// Let getRootLineFields() fill in the field values:
	$rootlineFields = array();
	$this->getRootLineFields($rootlineFields);

	$where = 'page_id='.intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootlineFields);
}
1769
/**
 * Fills the field array with root-line values taken from $this->conf['rootline_uids'].
 * rl0, rl1 and rl2 are always set. Additional fields can be registered in $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'] as pairs of field name => root-line level.
 *
 * @param	array		Field array, passed by reference
 * @return	void
 */
function getRootLineFields(&$fieldArr) {
	// The three standard levels:
	foreach (array(0, 1, 2) as $level) {
		$fieldArr['rl'.$level] = intval($this->conf['rootline_uids'][$level]);
	}

	// Extra fields registered by configuration:
	$extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($extraFields)) {
		return;
	}
	foreach ($extraFields as $fieldName => $rootLineLevel) {
		$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
	}
}
1789
/**
 * Finds indexed pages in the same phash_grouping with the same contentHash which are related to a gr_list hash other than that of the default gr_list, and removes each of them through removeOldIndexedPages().
 * NOT USED anywhere inside this class!
 *
 * @return	void
 */
function removeLoginpagesWithContentHash() {
	$where = '
				A.phash=B.phash
				AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
				AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
				AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

	while ($duplicate = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
		$logMessage = "The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$duplicate['phash']."' are now removed.";
		$this->log_setTSlogMessage($logMessage,1);

		$this->removeOldIndexedPages($duplicate['phash']);
	}
}
1807
/**
 * Includes the crawler class
 * Loads "class.tx_crawler_lib.php" from the path which t3lib_extMgm::extPath() returns for the "crawler" extension. Since require_once is used, the file is included at most once.
 *
 * @return	void
 */
function includeCrawlerClass() {
	// Not read in this function itself. The required file is evaluated in the scope of this function, so this declaration presumably makes $TYPO3_CONF_VARS visible to it (e.g. for an XCLASS check like the one at the end of this script) - TODO confirm.
	global $TYPO3_CONF_VARS;

	require_once(t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php');
}
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828 /********************************
1829 *
1830 * SQL; Submitting words
1831 *
1832 *******************************/
1833
/**
 * Adds new words to db
 * Looks up which words of the list are present in index_words already (matching their hash against the "wid" field) and inserts a record for every word that was not found.
 *
 * @param	array		Word List array (where each word has information about position etc). The keys are the words; from each value the "hash" and "metaphone" entries are read.
 * @return	void
 */
function checkWordList($wl) {
	// Collect the hashes of all words. foreach is used because each() is deprecated as of PHP 7.2 and removed in PHP 8.0.
	// The values go straight into the SQL below, so they are forced to integers like all other integers in the queries of this class.
	$phashArr = array();
	foreach ($wl as $wordInfo) {
		$phashArr[] = intval($wordInfo['hash']);
	}
	if (!count($phashArr)) {
		return;
	}

	$cwl = implode(',',$phashArr);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.$cwl.')');
	$knownWordCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

	// If every word was found there is nothing to insert.
	if ($knownWordCount!=count($wl)) {
		$this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownWordCount),1);

		// Drop the words that exist already; what is left has to be inserted.
		while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
			unset($wl[$row['baseword']]);
		}

		foreach ($wl as $key => $val) {
			$insertFields = array(
				'wid' => $val['hash'],
				'baseword' => $key,
				'metaphone' => $val['metaphone']
			);
			// A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
		}
	}
}
1869
/**
 * Replaces the index_rel records of a phash: all existing relations for the phash are deleted, then one relation is inserted per word in the list.
 *
 * @param	array		Word list array; from each entry "hash", "count", "first" and "cmp" are read.
 * @param	integer		phash value
 * @return	void
 */
function submitWords($wl,$phash) {
	// Clear out the relations stored for this phash so far:
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

	foreach ($wl as $wordInfo) {
		$relation = array();
		$relation['phash'] = $phash;
		$relation['wid'] = $wordInfo['hash'];
		$relation['count'] = $wordInfo['count'];
		$relation['first'] = $wordInfo['first'];
		// The frequency is the share of this word in the total word count, mapped by freqMap():
		$relation['freq'] = $this->freqMap($wordInfo['count']/$this->wordcount);
		// Only the bits of the bit mask are kept of "cmp":
		$relation['flags'] = $wordInfo['cmp'] & $this->flagBitMask;

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
1893
/**
 * Maps a frequency by the factor freqMax*100*freqRange.
 * A value below 1 is multiplied by the factor and capped at $this->freqRange. Any other value is divided by the factor (the way "back").
 *
 * @param	double		Frequency
 * @return	mixed		Mapped frequency; no rounding or integer conversion is done here.
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax*100*$this->freqRange;

	// Not below 1: map "back".
	if (!($freq<1)) {
		return $freq/$mapFactor;
	}

	$scaled = $freq*$mapFactor;
	if ($scaled>$this->freqRange) {
		return $this->freqRange;
	}
	return $scaled;
}
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923 /********************************
1924 *
1925 * Hashing
1926 *
1927 *******************************/
1928
/**
 * Calculates the search hashes for TYPO3 pages and stores them in $this->hash ("phash_grouping" and "phash").
 *
 * @return	void
 */
function setT3Hashes() {

	// Collect the identifying values. The order of the keys matters since the array is serialized before hashing.
	$hArray = array();
	$hArray['id'] = intval($this->conf['id']);
	$hArray['type'] = intval($this->conf['type']);
	$hArray['sys_lang'] = intval($this->conf['sys_language_uid']);
	$hArray['MP'] = strval($this->conf['MP']);
	$hArray['cHash'] = $this->cHashParams;

	// Grouping hash (identifies a "page" as the combination of id, type, language, mountpoint and cHash parameters):
	$groupingSource = serialize($hArray);
	$this->hash['phash_grouping'] = $this->md5inthash($groupingSource);

	// Plain phash: the gr_list is added, which gives a subdivision where special page composition based on login is taken into account as well. (Such pages are expected to be normally similar regardless of the login.)
	$hArray['gr_list'] = strval($this->conf['gr_list']);
	$phashSource = serialize($hArray);
	$this->hash['phash'] = $this->md5inthash($phashSource);
}
1952
/**
 * Calculates the search hashes for an external file.
 * "phash_grouping" is derived from the file alone, "phash" from the file together with the sub information.
 *
 * @param	string		File name / path which identifies it on the server
 * @param	array		Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return	array		Array with "phash_grouping" and "phash" inside.
 */
function setExtHashes($file,$subinfo=array()) {
	// Grouping hash: Only the file identifies the group.
	$identity = array('file' => $file);
	$groupingHash = $this->md5inthash(serialize($identity));

	// Plain phash: The sub information is added before hashing again.
	$identity['subinfo'] = $subinfo;
	$plainHash = $this->md5inthash(serialize($identity));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $plainHash
	);
}
1976
/**
 * Integer hash based on md5
 * Only the first 7 hex digits of the md5 hash are used (not 8), so the result stays within 28 bit. That keeps the integers clear of trouble with UNSIGNED integers and with PHP versions whose hexdec function has varying output.
 *
 * @param	string		String to hash
 * @return	integer		Integer interpretation of the first 7 hex digits of the md5 hash of the input string.
 */
function md5inthash($str) {
	$digest = md5($str);
	$leadingHex = substr($digest, 0, 7);

	return hexdec($leadingHex);
}
1987
/**
 * Calculates the cHash value of input GET array (for constructing cHash values if needed)
 * The array is turned into a query string by t3lib_div::implodeArrayForUrl(), which is passed through t3lib_div::cHashParams(); the serialized result of that is hashed with t3lib_div::shortMD5().
 *
 * @param	array		Array of GET parameters to encode
 * @return	mixed		The value returned by t3lib_div::shortMD5() (the calculated cHash value)
 */
function makeCHash($paramArray) {
	$queryString = t3lib_div::implodeArrayForUrl('', $paramArray);
	$hashParams = t3lib_div::cHashParams($queryString);
	$serialized = serialize($hashParams);

	return t3lib_div::shortMD5($serialized);
}
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013 /*********************************
2014 *
2015 * Internal logging functions
2016 *
2017 *********************************/
2018
/**
 * Wrapper for the push function of the TT logging object; does nothing if $GLOBALS['TT'] is not an object.
 *
 * @param	string		Title to set
 * @param	string		Key (?)
 * @return	void
 */
function log_push($msg,$key) {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->push($msg,$key);
}
2029
/**
 * Wrapper for the pull function of the TT logging object; does nothing if $GLOBALS['TT'] is not an object.
 *
 * @return	void
 */
function log_pull() {
	if (!is_object($GLOBALS['TT'])) {
		return;
	}
	$GLOBALS['TT']->pull();
}
2038
/**
 * Sets a log message: it is passed on to the TT logging object (if $GLOBALS['TT'] is an object) and in any case appended to $this->internal_log.
 *
 * @param	string		Message to set
 * @param	integer		Error number
 * @return	void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
	$ttAvailable = is_object($GLOBALS['TT']);
	if ($ttAvailable) {
		$GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
	}

	// The internal log gets the message regardless of TT:
	$this->internal_log[] = $msg;
}
2050
2051
2052
2053
2054
2055
2056
2057
2058 /**************************
2059 *
2060 * tslib_fe hooks:
2061 *
2062 **************************/
2063
/**
 * Frontend hook: If the page is not being re-generated this is our chance to force it to be (because re-generation of the page is required in order to have the indexer called!)
 * Nothing happens unless the crawler extension is loaded, a crawler session is running and "tx_indexedsearch_reindex" is among its processing instructions.
 *
 * @param	array		Parameters from frontend; "pObj" and "disableAcquireCacheData" are used.
 * @param	object		TSFE object (reference under PHP5); not used by this function.
 * @return	void
 */
function fe_headerNoCache(&$params, $ref) {

	// Requirement 1: The crawler is loaded.
	if (!t3lib_extMgm::isLoaded('crawler')) {
		return;
	}

	// Requirement 2: A crawler session is running.
	if (!$params['pObj']->applicationData['tx_crawler']['running']) {
		return;
	}

	// Requirement 3: Re-indexing is requested as processing instruction.
	if (!in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])) {
		return;
	}

	// Setting simple log entry:
	$logEntry = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];
	$params['pObj']->applicationData['tx_crawler']['log'][] = $logEntry;

	// Disables a look-up for cached page data - thus resulting in re-generation of the page even if cached.
	$params['disableAcquireCacheData'] = TRUE;
}
2085 }
2086
2087
// XCLASS inclusion: If TYPO3_MODE is defined and a file is registered for this script under the XCLASS key of $TYPO3_CONF_VARS for that mode, the registered file is included (once).
if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
}
2091 ?>