Fixed bug #15580: Add calls to logDeprecatedFunction() for more deprecated functions...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2010 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 141: class tx_indexedsearch_indexer
39 * 207: function hook_indexContent(&$pObj)
40 *
41 * SECTION: Backend API
42 * 308: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
43 * 347: function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
44 * 365: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
45 *
46 * SECTION: Initialization
47 * 416: function init()
48 * 468: function initializeExternalParsers()
49 *
50 * SECTION: Indexing; TYPO3 pages (HTML content)
51 * 509: function indexTypo3PageContent()
52 * 596: function splitHTMLContent($content)
53 * 642: function getHTMLcharset($content)
54 * 657: function convertHTMLToUtf8($content,$charset='')
55 * 685: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
56 * 712: function typoSearchTags(&$body)
57 * 741: function extractLinks($content)
58 * 812: function extractHyperLinks($string)
59 *
60 * SECTION: Indexing; external URL
61 * 871: function indexExternalUrl($externalUrl)
62 * 902: function getUrlHeaders($url)
63 *
64 * SECTION: Indexing; external files (PDF, DOC, etc)
65 * 948: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
66 * 1054: function readFileContent($ext,$absFile,$cPKey)
67 * 1071: function fileContentParts($ext,$absFile)
68 * 1089: function splitRegularContent($content)
69 *
70 * SECTION: Analysing content, Extracting words
71 * 1122: function charsetEntity2utf8(&$contentArr, $charset)
72 * 1145: function processWordsInArrays($contentArr)
73 * 1170: function procesWordsInArrays($contentArr)
74 * 1180: function bodyDescription($contentArr)
75 * 1202: function indexAnalyze($content)
76 * 1223: function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
77 * 1242: function analyzeBody(&$retArr,$content)
78 * 1262: function metaphone($word,$retRaw=FALSE)
79 *
80 * SECTION: SQL; TYPO3 Pages
81 * 1304: function submitPage()
82 * 1378: function submit_grlist($hash,$phash_x)
83 * 1398: function submit_section($hash,$hash_t3)
84 * 1416: function removeOldIndexedPages($phash)
85 *
86 * SECTION: SQL; External media
87 * 1459: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
88 * 1525: function submitFile_grlist($hash)
89 * 1539: function submitFile_section($hash)
90 * 1553: function removeOldIndexedFiles($phash)
91 *
92 * SECTION: SQL Helper functions
93 * 1589: function checkMtimeTstamp($mtime,$phash)
94 * 1625: function checkContentHash()
95 * 1642: function checkExternalDocContentHash($hashGr,$content_md5h)
96 * 1656: function is_grlist_set($phash_x)
97 * 1669: function update_grlist($phash,$phash_x)
98 * 1684: function updateTstamp($phash,$mtime=0)
99 * 1699: function updateSetId($phash)
100 * 1714: function updateParsetime($phash,$parsetime)
101 * 1727: function updateRootline()
102 * 1742: function getRootLineFields(&$fieldArr)
103 * 1761: function removeLoginpagesWithContentHash()
104 * 1778: function includeCrawlerClass()
105 *
106 * SECTION: SQL; Submitting words
107 * 1805: function checkWordList($wl)
108 * 1842: function submitWords($wl,$phash)
109 * 1866: function freqMap($freq)
110 *
111 * SECTION: Hashing
112 * 1899: function setT3Hashes()
113 * 1925: function setExtHashes($file,$subinfo=array())
114 * 1949: function md5inthash($str)
115 * 1959: function makeCHash($paramArray)
116 *
117 * SECTION: Internal logging functions
118 * 1991: function log_push($msg,$key)
119 * 2000: function log_pull()
120 * 2011: function log_setTSlogMessage($msg, $errorNum=0)
121 *
122 * SECTION: tslib_fe hooks:
123 * 2036: function fe_headerNoCache(&$params, $ref)
124 *
125 * TOTAL FUNCTIONS: 59
126 * (This index is automatically created/updated by the extension "extdeveval")
127 *
128 */
129 /**
130 * Indexing class for TYPO3 frontend
131 *
132 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
133 * @package TYPO3
134 * @subpackage tx_indexedsearch
135 */
136 class tx_indexedsearch_indexer {
137
138 // Messages:
139 var $reasons = array(
140 -1 => 'mtime matched the document, so no changes detected and no content updated',
141 -2 => 'The minimum age was not exceeded',
142 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
143 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
144 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
145 4 => 'Page has never been indexed (is not represented in the index_phash table).'
146 );
147
148 // HTML code blocks to exclude from indexing:
149 var $excludeSections = 'script,style';
150
151 // Supported Extensions for external files:
152 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods.
153
154 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
155 var $defaultGrList = '0,-1';
156
157 // Min/Max times:
158 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
159 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
160 var $maxExternalFiles = 0; // Max number of external files to index.
161
162 var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc.
163 var $crawlerActive = FALSE; // Set when crawler is detected (internal)
164
165 // INTERNALS:
166 var $defaultContentArray=array(
167 'title' => '',
168 'description' => '',
169 'keywords' => '',
170 'body' => '',
171 );
172 var $wordcount = 0;
173 var $externalFileCounter = 0;
174
175 var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning)
176 var $indexerConfig = array(); // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
177 var $hash = array(); // Hash array, contains phash and phash_grouping
178 var $file_phash_arr = array(); // Hash array for files
179 var $contentParts = array(); // Content of TYPO3 page
180 var $content_md5h = '';
181 var $internal_log = array(); // Internal log
182 var $indexExternalUrl_content = '';
183
184 var $cHashParams = array(); // cHashparams array
185
186 var $freqRange = 32000;
187 var $freqMax = 0.1;
188
189 // Objects:
190 /**
191 * Charset class object
192 *
193 * @var t3lib_cs
194 */
195 var $csObj;
196
197 /**
198 * Metaphone object, if any
199 *
200 * @var user_DoubleMetaPhone
201 */
202 var $metaphoneObj;
203
204 /**
205 * Lexer object for word splitting
206 *
207 * @var tx_indexedsearch_lexer
208 */
209 var $lexerObj;
210
211
212
213 /**
214 * Parent Object (TSFE) Initialization
215 *
216 * @param object Parent Object (frontend TSFE object), passed by reference
217 * @return void
218 */
function hook_indexContent(&$pObj) {
	// Frontend entry point: decides whether the page rendered by $pObj should be indexed
	// and, if so, copies everything the indexer needs into $this->conf and starts indexing.

	// Extension settings of indexed_search are stored serialized in the extConf array:
	$extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// The crawler forces re-indexing when it is loaded, a crawler session is running
	// and the re-index processing instruction was requested:
	$crawlerRequestsReindex = t3lib_extMgm::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($crawlerRequestsReindex) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

	// Nothing at all happens (not even logging) unless indexing is enabled for the page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

	// The checks are evaluated in this fixed order; the first one that fails decides the log message.
	if ($extensionSettings['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Collect the uid of every rootline level, keeping the rootline keys:
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $level => $pageRecord) {
			$rootlineUids[$level] = $pageRecord['uid'];
		}

		$this->conf = array(
			// Identification of the page being indexed:
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,

			// cHash string and the parameter array it was calculated from:
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,

			'crdate' => $pObj->page['crdate'],
			'page_cache_reg1' => $pObj->page_cache_reg1,
			'rootline_uids' => $rootlineUids,

			// Page content, alternative title, content charset and last-change time:
			'content' => $pObj->content,
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			'metaCharset' => $pObj->metaCharset,
			'mtime' => $pObj->register['SYS_LASTCHANGED'],

			// Behaviour switches taken from the page's "config" setup; meta tags are indexed unless configured otherwise:
			'index_externals' => $pObj->config['config']['index_externals'],
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true,

			// Not used for regular frontend indexing:
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0,
		);

		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
295
296
297
298
299
300
301
302
303 /****************************
304 *
305 * Backend API
306 *
307 ****************************/
308
309 /**
310 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
311 *
312 * @param integer The page uid, &id=
313 * @param integer The page type, &type=
314 * @param integer sys_language uid, typically &L=
315 * @param string The MP variable (Mount Points), &MP=
316 * @param array Rootline array of only UIDs.
317 * @param array Array of GET variables to register with this indexing
318 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
319 * @return void
320 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {
	// Backend counterpart of hook_indexContent(): fills $this->conf from the given
	// arguments plus fixed defaults and then runs init().

	// A cHash is only calculated on request, otherwise it stays empty:
	$cHash = '';
	if ($createCHash) {
		$cHash = t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array));
	}

	$this->conf = array(
		// Identification of the page being indexed:
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		'gr_list' => '0,-1',	// Group list is hardcoded for backend indexing

		// cHash string and the GET parameters registered with this indexing:
		'cHash' => $cHash,
		'cHash_array' => $cHash_array,

		// Defaults:
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',

		// Rootline as a plain array of uids:
		'rootline_uids' => $uidRL,

		// Fixed behaviour: index external documents, description length 200, index meta tags:
		'index_externals' => 1,
		'index_descrLgd' => 200,
		'index_metatags' => true,
	);

	$this->init();
}
353
354 /**
355 * Sets the free-index uid. Can be called right after backend_initIndexer()
356 *
357 * @param integer Free index UID
358 * @param integer Set id - an integer identifying the "set" of indexing operations.
359 * @return void
360 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	// Store both identifiers of the "free index" operation in the configuration array.
	$freeIndexSettings = array(
		'freeIndexUid' => $freeIndexUid,
		'freeIndexSetId' => $freeIndexSetId,
	);
	foreach ($freeIndexSettings as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
365
366 /**
367 * Indexing records as the content of a TYPO3 page.
368 *
369 * @param string Title equivalent
370 * @param string Keywords equivalent
371 * @param string Description equivalent
372 * @param string The main content to index
373 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
374 * @param integer Last modification time, in seconds
375 * @param integer The creation date of the content, in seconds
376 * @param integer The record UID that the content comes from (for registration with the indexed rows)
377 * @return void
378 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {
	// Wraps the given record data in a minimal HTML document so that it can be
	// indexed through the same code path as a rendered TYPO3 page.

	// Timestamps and origin of the content:
	$this->conf['mtime'] = $mtime;
	$this->conf['crdate'] = $crdate;
	$this->conf['recordUid'] = $recordUid;

	// All values are escaped before they are placed into the fake document:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedContent = htmlspecialchars($content);

	$fakeHtml = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedContent . '
</body>
</html>';
	$this->conf['content'] = $fakeHtml;

	// Charset of the supplied strings; no alternative document title is used here:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

	$this->indexTypo3PageContent();
}
406
407
408
409
410
411
412
413
414
415
416
417
418
419 /********************************
420 *
421 * Initialization
422 *
423 *******************************/
424
425 /**
426 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
427 *
428 * @return void
429 */
function init() {
	global $TYPO3_CONF_VARS;

	// Prepares the indexer for one run: cHash parameters, page hashes, extension
	// settings, external parsers, lexer, optional metaphone object and charset object.
	// $this->conf must be filled before this method is called.

	// cHash parameters: add the cHash itself (needed for correct URLs) and make sure
	// the encryption key, which TSFE adds for the calculation, is never kept in the array.
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}

	// phash / phash_grouping identify the indexed page:
	$this->setT3Hashes();

	// Settings from the Extension Manager; the age limits are multiplied by 3600 before use:
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$minAgeSeconds = $this->indexerConfig['minAge']*3600;
	$maxAgeSeconds = $this->indexerConfig['maxAge']*3600;
	$this->tstamp_minAge = t3lib_div::intInRange($minAgeSeconds,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($maxAgeSeconds,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

	// External document parsers are only needed when external documents are indexed:
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

	// Lexer (splits text into words). A configured object reference,
	// $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'], replaces the default one:
	$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer']) {
		$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'];
	}
	$this->lexerObj = t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

	// Optional metaphone object, configured in
	// $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']:
	$metaphoneObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'];
	if ($metaphoneObjRef) {
		$this->metaphoneObj = t3lib_div::getUserObj($metaphoneObjRef);
		$this->metaphoneObj->pObj = $this;
	}

	// Charset conversion object:
	$this->csObj = t3lib_div::makeInstance('t3lib_cs');
}
474
475 /**
476 * Initialize external parsers
477 *
478 * @return void
479 * @access private
480 * @see init()
481 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	// Instantiates every parser registered in
	// $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (file extension => object reference)
	// and keeps those whose initParser() call succeeds in $this->external_parsers.

	if (!is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}

	foreach ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$parser = t3lib_div::getUserObj($_objRef);

		// A reference that does not resolve to an object would end in a fatal error
		// on the initParser() call below, so such an entry is skipped and reported instead.
		if (!is_object($parser)) {
			$this->log_setTSlogMessage('External parser for file extension "' . $extension . '" could not be instantiated and was skipped.', 2);
			continue;
		}

		// Register the parser before initializing it, so it is reachable through pObj during initParser():
		$this->external_parsers[$extension] = $parser;
		$parser->pObj = $this;

		// A parser that reports failure from initParser() is removed again:
		if (!$parser->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512 /********************************
513 *
514 * Indexing; TYPO3 pages (HTML content)
515 *
516 *******************************/
517
518 /**
519 * Start indexing of the TYPO3 page
520 *
521 * @return void
522 */
function indexTypo3PageContent() {
	// Indexes the page content found in $this->conf['content'] if the timestamp check,
	// the gr_list registration or the force flag calls for it.

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	// Nothing to do: timestamps say "up to date", the gr_list is registered and indexing is not forced.
	if (!($check > 0 || !$is_grlist || $this->forceIndexing)) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

	// Log why indexing is attempted:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

	// Divide the content into title, keywords, description and body.
	// A configured alternative title replaces the one found in the HTML.
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

	// Hash over all content parts (title, description and keywords included, so a changed title is detected as well).
	// implode() is called with the separator first; the reversed legacy argument order is not accepted by PHP 8.
	$this->content_md5h = $this->md5inthash(implode('', $this->contentParts));

	// checkContentHash() is expected to return an array when a page with the very same content hash is already indexed
	// (see that method for the exact lookup). In that case only the bookkeeping is refreshed, unless $check is 1.
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check !== 1) {
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the row that has the same content hash as the current page.
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

	// Full indexing run; the parse time is measured from here:
	$Pstart = t3lib_div::milliseconds();

	$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
	$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
	$this->log_pull();

	// Splitting words
	$this->log_push('Extract words from content','');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

	// Analyse the indexed words.
	$this->log_push('Analyse the extracted words','');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

	// Submitting page (phash) record
	$this->log_push('Submitting page','');
	$this->submitPage();
	$this->log_pull();

	// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words','');
	$this->checkWordList($indexArr);
	$this->submitWords($indexArr,$this->hash['phash']);
	$this->log_pull();

	// Set parsetime
	$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

	// Checking external files if configured for.
	$this->log_push('Checking external files','');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
602
603 /**
604 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
605 *
606 * @param string HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
607 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
608 * @see splitRegularContent()
609 */
function splitHTMLContent($content) {
	// Splits an HTML document into the parts "title", "keywords", "description" and "body".
	// The body is returned as plain text (tags stripped), the other parts come from the head.

	$contentArr = $this->defaultContentArray;

	// Divide head from body at the first "<body". When the document has no body tag,
	// the whole content is treated as head so that title and meta tags are still found.
	$bodyPart = stristr($content,'<body');
	if ($bodyPart === FALSE) {
		$headPart = $content;
		$contentArr['body'] = '';
	} else {
		$headPart = substr($content, 0, strlen($content) - strlen($bodyPart));
		$contentArr['body'] = $bodyPart;
	}

	// Placeholders for by-reference results of embracingTags() that are not needed here:
	$dummy = NULL;
	$dummy2 = NULL;

	// Title: if it contains a ":", only the part after the first ":" is used.
	$this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
	$titleParts = explode(':',$contentArr['title'],2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

	// Keywords and description meta tags (only if configured):
	if ($this->conf['index_metatags']) {
		// Collect the attribute string of every <meta> tag; each call consumes one tag from $headPart.
		$metaTags = array();
		$tagParams = '';
		while ($this->embracingTags($headPart,'meta',$dummy,$headPart,$tagParams)) {
			$metaTags[] = $tagParams;
		}

		foreach ($metaTags as $tagParams) {
			$attributes = t3lib_div::get_tag_attributes($tagParams);
			$metaName = isset($attributes['name']) ? $attributes['name'] : '';
			$metaContent = isset($attributes['content']) ? $attributes['content'] : '';

			if (stristr($metaName, 'keywords')) {
				$contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
			}
			if (stristr($metaName, 'description')) {
				$contentArr['description'] .= ',' . $metaContent;
			}
		}
	}

	// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

	// Get rid of unwanted sections (ie. scripting and style stuff) in body
	$tagList = explode(',',$this->excludeSections);
	foreach ($tagList as $tag) {
		while ($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
	}

	// Remove tags, but first put a space before each one so that words are not concatenated:
	$contentArr['body'] = str_replace('<',' <',$contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

	return $contentArr;
}
655
656 /**
657 * Extract the charset value from HTML meta tag.
658 *
659 * @param string HTML content
660 * @return string The charset value if found.
661 */
function getHTMLcharset($content) {
	// Step 1: find the <meta http-equiv="Content-Type" ...> tag (case-insensitive).
	$metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
	if (!preg_match($metaTagPattern,$content,$metaTagMatch)) {
		return NULL;
	}

	// Step 2: read the charset value from inside that tag.
	$charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
	if (!preg_match($charsetPattern,$metaTagMatch[0],$charsetMatch)) {
		return NULL;
	}

	return $charsetMatch[1];
}
669
670 /**
671 * Converts a HTML document to utf-8
672 *
673 * @param string HTML content, any charset
674 * @param string Optional charset (otherwise extracted from HTML)
675 * @return string Converted HTML
676 */
function convertHTMLToUtf8($content,$charset='') {
	// Without an explicit charset, the one declared in the HTML itself is used:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

	// Only content in a known charset other than utf-8 needs to be converted:
	$needsConversion = $charset && $charset!=='utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

	// The content is treated as utf-8 from here on; entities are converted to utf-8 as well:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
692
693 /**
694 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
695 * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
696 * <title> of document or removing <script>-sections
697 *
698 * @param string String to search in
699 * @param string Tag name, eg. "script"
700 * @param string Passed by reference: Content inside found tag
701 * @param string Passed by reference: Content after found tag
702 * @param string Passed by reference: Attributes of the found tag.
703 * @return boolean Returns false if tag was not found, otherwise true.
704 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$openingTag = '<'.$tagName;
	$closingTag = '</'.$tagName.'>';

	// Case-insensitive search; $fromOpeningTag is $string from the first opening tag onwards.
	$fromOpeningTag = stristr($string,$openingTag);
	if (!$fromOpeningTag) {
		return false;
	}

	// Everything up to the first ">" is the attribute list, the remainder follows the opening tag:
	$pieces = explode('>',substr($fromOpeningTag,strlen($openingTag)),2);
	$paramList = $pieces[0];
	$afterOpeningTag = $pieces[1];

	$fromClosingTag = stristr($afterOpeningTag,$closingTag);
	if (!$fromClosingTag) {
		// No closing tag: the tag content is blank and only what follows the opening tag is returned.
		$tagContent = '';
		$stringAfter = $afterOpeningTag;
		return true;
	}

	// Closing tag found: return the enclosed content and the original string with the whole element cut out.
	$openingTagOffset = strlen($string) - strlen($fromOpeningTag);
	$contentLength = strlen($afterOpeningTag) - strlen($fromClosingTag);
	$tagContent = substr($afterOpeningTag,0,$contentLength);
	$stringAfter = substr($string,0,$openingTagOffset).substr($fromClosingTag,strlen($closingTag));

	return true;
}
725
726 /**
727 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
728 *
729 * @param string HTML Content, passed by reference
730 * @return boolean Returns true if a TYPO3SEARCH_ tag was found, otherwise false.
731 */
function typoSearchTags(&$body) {
	// Rebuilds $body from the sections selected by <!--TYPO3SEARCH_begin--> and <!--TYPO3SEARCH_end-->
	// markers. $body is left untouched when it contains no such marker.

	// Split at every marker start; each chunk after the first begins with the marker keyword ("begin-->..." / "end-->...").
	$chunks = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);
	if (count($chunks) <= 1) {
		return false;
	}

	$body = '';
	// Last chunk that was neither a "begin" nor an "end" chunk. Initialized so that an
	// "end" chunk in first position does not read an undefined variable.
	$prev = '';

	foreach ($chunks as $chunk) {
		$part = explode('-->',$chunk,2);
		$marker = trim($part[0]);

		if ($marker === 'begin') {
			// Content following a begin marker is kept (guarded: the chunk may lack the closing "-->").
			$body .= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker === 'end') {
			// An end marker keeps the remembered chunk (empty after a begin marker); text after the end marker is dropped.
			$body .= $prev;
		} else {
			$prev = $chunk;
		}
	}

	return true;
}
754
755 /**
756 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
757 *
758 * @param string HTML content
759 * @return void
760 */
function extractLinks($content) {
	// Walks through all hyperlinks found in $content. External URLs are indexed if configured;
	// links to existing local files are indexed directly or handed over to the crawler queue.

	// Get links:
	$list = $this->extractHyperLinks($content);

	// Crawler object, only created when the crawler is configured for external files and loaded.
	// Initialized to NULL so the is_object() check below never reads an undefined variable.
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

	// Traverse links:
	foreach ($list as $linkInfo) {

		// localPath means: This file is sent by a download script. While the indexed URL has to point
		// to $linkInfo['href'], the absolute path to the file is specified in localPath.
		$hasLocalPath = !empty($linkInfo['localPath']);

		// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

		// Parse URL. parse_url() may return FALSE for malformed URLs and omits absent components,
		// so the parts are read defensively.
		$qParts = parse_url($linkSource);
		$query = (is_array($qParts) && isset($qParts['query'])) ? $qParts['query'] : '';

		// Check for jumpurl (TYPO3 specific thing...): the real target is the jumpurl parameter.
		if ($query && strstr($query,'jumpurl=')) {
			parse_str($query,$getP);
			$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);	// parse again due to new linkSource!
			$query = (is_array($qParts) && isset($qParts['query'])) ? $qParts['query'] : '';
		}
		$scheme = (is_array($qParts) && isset($qParts['scheme'])) ? $qParts['scheme'] : '';

		// A link with a scheme (and no local path) is an external URL:
		if (!$hasLocalPath && $scheme) {
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

		// Local links with a query string are not treated as files:
		if ($query) {
			continue;
		}

		// Resolve the link to an absolute file name:
		$linkSource = urldecode($linkSource);
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		if (is_object($crawler)) {
			// Hand the file over to the crawler queue. The key order (document, alturl, conf) is kept as before;
			// the page content is removed from the configuration passed along.
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($hasLocalPath) {
			// Index under the href URL, reading the content from the local path; the file extension is taken from the local path.
			$fI = pathinfo($linkSource);
			$ext = strtolower(isset($fI['extension']) ? $fI['extension'] : '');
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			// Index local file:
			$this->indexRegularDocument($linkSource);
		}
	}
}
840
841 /**
842 * Extracts all links to external documents from the HTML content string
843 *
844 * @param string $html
845 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
846 * @see extractLinks()
847 */
function extractHyperLinks($html) {
	// Collects all <a> tags of $html that carry a non-anchor href.
	// Each result entry has the keys "tag", "href" and "localPath".
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('a', $html);
	$hyperLinksData = array();

	foreach ($htmlParts as $index => $tagData) {
			// Only odd positions are inspected (presumably the matched tags,
			// with the text in between at even positions - see splitTags()).
		if (($index % 2) === 0) {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

		$href = isset($tagAttributes[0]['href']) ? $tagAttributes[0]['href'] : '';
			// substr() replaces the {0} string offset syntax, which PHP 8 no longer parses.
		if ($href && substr($href, 0, 1) != '#') {
			$hyperLinksData[] = array(
				'tag' => $tagData,
				'href' => $href,
				'localPath' => $this->createLocalPath($href)
			);
		}
	}

	return $hyperLinksData;
}
871
872 /**
873 * Extracts the "base href" from content string.
874 *
875 * @param string Content to analyze
876 * @return string The base href or an empty string if not found
877 */
public function extractBaseHref($html) {
	// Looks for <base> tags in $html and returns the first truthy href found.
	$parser = t3lib_div::makeInstance('t3lib_parseHtml');
	$baseHref = '';

	foreach ($parser->splitTags('base', $html) as $position => $chunk) {
			// Only odd positions are inspected.
		if (($position % 2) === 0) {
			continue;
		}

		$attributes = $parser->get_tag_attributes($chunk, true);
		$tagName = $parser->getFirstTagName($chunk);
		if (strtolower($tagName) != 'base') {
			continue;
		}

		$baseHref = $attributes[0]['href'];
		if ($baseHref) {
			return $baseHref;
		}
	}

		// Nothing usable found: whatever was seen last (initially the empty string).
	return $baseHref;
}
897
898 /******************************************
899 *
900 * Indexing; external URL
901 *
902 ******************************************/
903
904 /**
905 * Index External URLs HTML content
906 *
907 * @param string URL, eg. "http://typo3.org/"
908 * @return void
909 * @see indexRegularDocument()
910 */
function indexExternalUrl($externalUrl) {
	// Fetches $externalUrl and, if it is served as text/html, indexes the
	// content through a temporary file.

		// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
			// No answer from the server.
		return;
	}

		// HTTP header field names are case-insensitive, so fall back to a
		// case-insensitive search when the canonical spelling is absent.
	$contentType = isset($urlHeaders['Content-Type']) ? (string) $urlHeaders['Content-Type'] : '';
	if ($contentType === '') {
		foreach ($urlHeaders as $headerName => $headerValue) {
			if (strtolower(trim((string) $headerName)) === 'content-type') {
				$contentType = (string) $headerValue;
				break;
			}
		}
	}
	if (!stristr($contentType, 'text/html')) {
		return;
	}

		// The fetched content is also kept in $this->indexExternalUrl_content.
	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

		// Create temporary file:
	$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
	if ($tmpFile) {
		t3lib_div::writeFile($tmpFile, $content);

			// Index that file. "TRUE" as second parameter forces indexing
			// of external URLs (mtime doesn't make sense, does it?)
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
		unlink($tmpFile);
	}
}
936
937 /**
938 * Getting HTTP response headers of URL
939 *
940 * @param string The URL
941 * @param integer Timeout (seconds?)
942 * @return mixed If no answer, returns false. Otherwise an array where HTTP headers are keys
943 */
function getUrlHeaders($url) {
	// Fetches the headers for $url and returns them as array (header name => raw
	// value, not trimmed). Returns FALSE if nothing was received.
	$content = t3lib_div::getURL($url, 2);	// Try to get the headers only

	if (!strlen($content)) {
			// Explicit FALSE as documented, instead of falling off the end of the function.
		return FALSE;
	}

		// Compile headers:
	$retVal = array();
	foreach (t3lib_div::trimExplode(LF, $content, 1) as $line) {
		if (!strlen(trim($line))) {
			break;	// Stop at the first empty line (= end of header)
		}

			// Lines without a colon (e.g. a status line) keep their old
			// representation: the whole line as key with a NULL value.
		$lineParts = explode(':', $line, 2);
		$retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
	}

	return $retVal;
}
962
963
964
965 /**
966 * Checks if the file is local
967 *
968 * @param $sourcePath
969 * @return string Absolute path to file if file is local, else empty string
970 */
protected function createLocalPath($sourcePath) {
	// Tries the resolver methods below in order and returns the first
	// non-empty local path; the result is empty if none of them matched.
	static $pathFunctions = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);

	$candidate = '';
	foreach ($pathFunctions as $resolver) {
		$candidate = $this->$resolver($sourcePath);
		if ($candidate != '') {
			return $candidate;
		}
	}

	return $candidate;
}
988
989 /**
990 * Attempts to create a local file path from T3VARs. This is useful for
991 * various download extensions that hide actual file name but still want the
992 * file to be indexed.
993 *
994 * @param string $sourcePath
995 * @return string
996 */
protected function createLocalPathFromT3vars($sourcePath) {
	// Looks up $sourcePath (by its short MD5) in the list of local files
	// registered in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}

	$lookupKey = t3lib_div::shortMD5($sourcePath);

		// Note: not using self::isAllowedLocalFile here because this method
		// is allowed to index files outside of the web site (for example,
		// protected downloads)
	$isKnownFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);

	return $isKnownFile ? $registeredFiles[$lookupKey] : '';
}
1011
1012 /**
1013 * Attempts to create a local file path by matching a current request URL.
1014 *
1015 * @param string $sourcePath
1016 * @return string
1017 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	// Maps a URL that starts with the current site URL to a file below PATH_site.
	$siteUrl = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);

	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1031
1032 /**
1033 * Attempts to create a local file path by matching absRefPrefix. This
1034 * requires TSFE. If TSFE is missing, this function does nothing.
1035 *
1036 * @param string $sourcePath
1037 * @return string
1038 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	// Maps a URL that starts with the configured absRefPrefix to a file
	// below PATH_site. Does nothing without a TSFE object.
	if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
		return '';
	}

	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);
	if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $prefixLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1054
1055 /**
1056 * Attempts to create a local file path from the absolute URL without
1057 * schema.
1058 *
1059 * @param string $sourcePath
1060 * @return string
1061 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	// Maps a URL starting with "/" to a file below PATH_site.
	// Returns the empty string if the URL does not start with "/" or the
	// resulting path is not an allowed local file.
	$localPath = '';

		// substr() replaces the {0} string offset syntax (no longer parsed by
		// PHP 8) and is also safe for an empty $sourcePath.
	if (substr($sourcePath, 0, 1) === '/') {
		$localPath = PATH_site . substr($sourcePath, 1);
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}

	return $localPath;
}
1073
1074 /**
1075 * Attempts to create a local file path from the relative URL.
1076 *
1077 * @param string $sourcePath
1078 * @return string
1079 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	// Maps a relative URL to a file below PATH_site.
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}

	$candidate = PATH_site . $sourcePath;

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1090
1091 /**
1092 * Checks if URL is relative.
1093 *
1094 * @param string $url
1095 * @return boolean
1096 */
static protected function isRelativeURL($url) {
	// A URL counts as relative when it has no scheme and its path does not
	// start with "/".
		// "@" kept from the original call to silence parse_url() warnings.
	$urlParts = @parse_url($url);
	if (!is_array($urlParts)) {
			// parse_url() returns FALSE for seriously malformed URLs. Treating
			// the result as "no parts" keeps the previous outcome (TRUE).
		$urlParts = array();
	}

	$scheme = isset($urlParts['scheme']) ? (string) $urlParts['scheme'] : '';
	$path = isset($urlParts['path']) ? (string) $urlParts['path'] : '';

		// substr() replaces the {0} string offset syntax (no longer parsed by PHP 8).
	return ($scheme === '' && substr($path, 0, 1) !== '/');
}
1101
1102 /**
1103 * Checks if the path points to the file inside the web site
1104 *
1105 * @param string $filePath
1106 * @return boolean
1107 */
static protected function isAllowedLocalFile($filePath) {
	// TRUE only if the path (after resolving back paths) starts with
	// PATH_site and points to an existing file.
	$resolvedPath = t3lib_div::resolveBackPath($filePath);
	$sitePrefix = substr($resolvedPath, 0, strlen(PATH_site));
	$exists = is_file($resolvedPath);

	return ($sitePrefix == PATH_site) && $exists;
}
1114
1115 /******************************************
1116 *
1117 * Indexing; external files (PDF, DOC, etc)
1118 *
1119 ******************************************/
1120
1121 /**
1122 * Indexing a regular document given as $file (relative to PATH_site, local file)
1123 *
1124 * @param string Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
1125 * @param boolean If set, indexing is forced (despite content hashes, mtime etc).
1126 * @param string Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
1127 * @param string File extension for temporary file.
1128 * @return void
1129 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {
	// Indexes one external document. $file is the name stored in the index;
	// the content is read from $contentTmpFile when given, otherwise from $file
	// itself. $altExtension overrides the extension used to select the parser.
	// Every outcome is reported through log_setTSlogMessage().

		// Init
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
			// pathinfo() has no "extension" key for names without a dot.
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

		// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

		// Indexing the document, one pass per content part:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext, $absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file, $subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

					// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext, $absFile, $cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
						// Separator first: the legacy implode($array, $separator)
						// argument order is rejected by PHP 8.
					$content_md5h = $this->md5inthash(implode('', $contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);	// The creation time of a file cannot be determined, so the modification time is used instead.
						$this->submitFilePage($phash_arr, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr, $phash_arr['phash']);
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'], t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'], $mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// Checking and setting sections:
			// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
1226
1227 /**
1228 * Reads the content of an external file being indexed.
1229 * The content from the external parser MUST be returned in utf-8!
1230 *
1231 * @param string File extension, eg. "pdf", "doc" etc.
1232 * @param string Absolute filename of file (must exist and be validated OK before calling function)
1233 * @param string Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
1234 * @return array Standard content array (title, description, keywords, body keys)
1235 */
function readFileContent($ext,$absFile,$cPKey) {
	// Delegates reading of $absFile to the external parser registered for $ext.
	// Returns whatever the parser returns, or NULL if no parser object is
	// registered for the extension.

		// Explicit default; previously the variable was undefined in that case.
	$contentArr = NULL;

		// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext, $absFile, $cPKey);
	}

	return $contentArr;
}
1245
1246 /**
1247 * Creates an array with pointers to divisions of document.
1248 *
1249 * @param string File extension
1250 * @param string Absolute filename (must exist and be validated OK before calling function)
1251 * @return array Array of pointers to sections that the document should be divided into
1252 */
function fileContentParts($ext,$absFile) {
	// Asks the external parser for $ext how the document is divided.
	// Without a parser object the single pointer 0 is returned.
	$parser = $this->external_parsers[$ext];

	if (!is_object($parser)) {
		return array(0);
	}

	return $parser->fileContentParts($ext, $absFile);
}
1263
1264 /**
1265 * Splits non-HTML content (from external files for instance)
1266 *
1267 * @param string Input content (non-HTML) to index.
1268 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
1269 * @see splitHTMLContent()
1270 */
function splitRegularContent($content) {
	// Starts from the default content array and stores $content as "body".
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;

	return $parts;
}
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291 /**********************************
1292 *
1293 * Analysing content, Extracting words
1294 *
1295 **********************************/
1296
1297 /**
1298 * Convert character set and HTML entities in the value of input content array keys
1299 *
1300 * @param array Standard content array
1301 * @param string Charset of the input content (converted to utf-8)
1302 * @return void
1303 */
function charsetEntity2utf8(&$contentArr, $charset) {
	// Converts every non-empty entry of $contentArr (in place) to utf-8 and
	// decodes the entities it contains.
	$needsCharsetConversion = ($charset !== 'utf-8');

	foreach ($contentArr as $field => $unused) {
		if (!strlen($contentArr[$field])) {
			continue;
		}

		$text = $contentArr[$field];

			// Convert charset if necessary
		if ($needsCharsetConversion) {
			$text = $this->csObj->utf8_encode($text, $charset);
		}

			// decode all numeric / html-entities in the string to real characters:
		$contentArr[$field] = $this->csObj->entities_to_utf8($text, TRUE);
	}
}
1319
1320 /**
1321 * Processing words in the array from split*Content -functions
1322 *
1323 * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
1324 * @return array Content input array modified so each key is now an array of words (duplicates removed for title, keywords and description)
1325 */
function processWordsInArrays($contentArr) {
	// Replaces every entry of $contentArr by the result of the lexer's
	// split2Words() and removes duplicates from the header fields.

		// split all parts to words
	foreach ($contentArr as $field => $text) {
		$contentArr[$field] = $this->lexerObj->split2Words($text);
	}

		// For title, keywords, and description we don't want duplicates:
	foreach (array('title', 'keywords', 'description') as $headerField) {
		$contentArr[$headerField] = array_unique($contentArr[$headerField]);
	}

	return $contentArr;
}
1341
1342 /**
1343 * Processing words in the array from split*Content -functions
1344 * This function is only a wrapper because the function has been renamed to processWordsInArrays() (see above).
1345 *
1346 * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
1347 * @return array Content input array modified so each key is now an array of words (duplicates removed for title, keywords and description)
1348 * @deprecated since TYPO3 4.0, this function will be removed in TYPO3 4.6.
1349 */
function procesWordsInArrays($contentArr) {
	// Deprecated alias: logs the deprecated call, then forwards to
	// processWordsInArrays().
	t3lib_div::logDeprecatedFunction();

	$processed = $this->processWordsInArrays($contentArr);

	return $processed;
}
1355
1356 /**
1357 * Extracts the sample description text from the content array.
1358 *
1359 * @param array Content array
1360 * @return string Description string
1361 */
function bodyDescription($contentArr) {
	// Builds the sample description from $contentArr['body']: whitespace
	// characters are replaced by spaces and the text is truncated to the
	// configured length ($this->conf['index_descrLgd'], forced into 0-255).

		// Explicit default so that a string is returned (instead of an
		// undefined variable) when the maximum length evaluates to 0.
	$bodyDescription = '';

		// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
			// NOTE(review): the first search string reads as a plain space here, which
			// makes that replacement a no-op - possibly an entity or non-breaking
			// space lost in transit; confirm against the upstream file.
		$bodyDescription = str_replace(array(' ',TAB,CR,LF),' ',$contentArr['body']);

			// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1377
1378 /**
1379 * Analyzes content to use for indexing,
1380 *
1381 * @param array Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
1382 * @return array Index Array (whatever that is...)
1383 */
function indexAnalyze($content) {
	// Builds the index array from the word lists in $content. Header fields
	// are registered with their type bit (title 7, keywords 6, description 5),
	// body words afterwards. The unused local $counter was removed.
	$indexArr = array();

	$this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
	$this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
	$this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
	$this->analyzeBody($indexArr, $content);

	return $indexArr;
}
1395
1396 /**
1397 * Calculates relevant information for headercontent
1398 *
1399 * @param array Index array, passed by reference
1400 * @param array Standard content array
1401 * @param string Key from standard content array
1402 * @param integer Bit-wise priority to type
1403 * @return void
1404 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
	// Registers every word of $content[$key] in $retArr: sets the type bit
	// 2^$offset in "cmp", increases "count" and stores the word hash and the
	// metaphone value. Also increases $this->wordcount per word.
	$typeBit = pow(2, $offset);

	foreach ($content[$key] as $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

			// A word seen for the first time has no "cmp"/"count" yet; start
			// from 0 explicitly instead of reading undefined array keys.
		$cmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

		$retArr[$val]['cmp'] = $cmp | $typeBit;
		$retArr[$val]['count'] = $count + 1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
1415
1416 /**
1417 * Calculates relevant information for bodycontent
1418 *
1419 * @param array Index array, passed by reference
1420 * @param array Standard content array
1421 * @return void
1422 */
function analyzeBody(&$retArr,$content) {
	// Registers every word of $content['body'] in $retArr. New words get
	// their first position ("first"), hash and metaphone value; "count" is
	// increased for every occurrence, as is $this->wordcount.
	foreach ($content['body'] as $key => $val) {
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		if (!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

			// New words have no "count" yet; start from 0 explicitly.
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $count + 1;
		$this->wordcount++;
	}
}
1435
1436 /**
1437 * Creating metaphone based hash from input word
1438 *
1439 * @param string Word to convert
1440 * @param boolean If set, returns the raw metaphone value (not hashed)
1441 * @return mixed Metaphone hash integer (or raw value, string)
1442 */
function metaphone($word,$retRaw=FALSE) {
	// Uses the configured metaphone object if there is one, otherwise PHP's
	// own metaphone(). Returns the raw value when $retRaw is set, else an
	// integer hash of it (0 for an empty value).
	if (is_object($this->metaphoneObj)) {
		$rawValue = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
	} else {
		$rawValue = metaphone($word);
	}

		// Return raw value?
	if ($retRaw) {
		return $rawValue;
	}

		// Otherwise create hash and return integer
	if ($rawValue == '') {
		return 0;
	}

	return hexdec(substr(md5($rawValue),0,7));
}
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474 /********************************
1475 *
1476 * SQL; TYPO3 Pages
1477 *
1478 *******************************/
1479
1480 /**
1481 * Updates db with information about the page (TYPO3 page, not external media)
1482 *
1483 * @return void
1484 */
function submitPage() {
	// Replaces the index records of the current TYPO3 page: index_phash,
	// index_section, index_grlist, index_fulltext and - in debug mode -
	// index_debug.

		// Remove any current data for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);

		// setting new phash_row
	$phashRow = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

		// PROCESSING index_section
	$this->submit_section($this->hash['phash'], $this->hash['phash']);

		// PROCESSING index_grlist
	$this->submit_grlist($this->hash['phash'], $this->hash['phash']);

		// PROCESSING index_fulltext
	$fulltextPhash = $this->hash['phash'];
	$fulltextData = implode(' ', $this->contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
			// Cut to the configured maximum length.
		$fulltextData = substr($fulltextData, 0, $this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', array(
		'phash' => $fulltextPhash,
		'fulltextdata' => $fulltextData
	));

		// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
		'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', array(
		'phash' => $this->hash['phash'],
		'debuginfo' => serialize($debugInfo)
	));
}
1550
1551 /**
1552 * Stores gr_list in the database.
1553 *
1554 * @param integer Search result record phash
1555 * @param integer Actual phash of current content
1556 * @return void
1557 * @see update_grlist()
1558 */
function submit_grlist($hash,$phash_x) {
	// Inserts one index_grlist record for the current gr_list.
	$groupList = $this->conf['gr_list'];

		// Setting the gr_list record
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($groupList),
		'gr_list' => $groupList
	));
}
1570
1571 /**
1572 * Stores section
1573 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
1574 *
1575 * @param integer phash of TYPO3 parent search result record
1576 * @param integer phash of the file indexation search record
1577 * @return void
1578 */
function submit_section($hash,$hash_t3) {
	// Inserts one index_section record for the current page id.
	$sectionRow = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

		// Presumably adds the rootline fields to the row (method not shown here).
	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
1590
1591 /**
1592 * Removes records for the indexed page, $phash
1593 *
1594 * @param integer phash value to flush
1595 * @return void
1596 */
function removeOldIndexedPages($phash) {
	// Deletes all index records that belong to the page with the given phash.

		// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$pageTables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($pageTables as $tableName) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash='.intval($phash));
	}

		// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.intval($phash));
}
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619 /********************************
1620 *
1621 * SQL; External media
1622 *
1623 *******************************/
1624
1625
1626 /**
1627 * Updates db with information about the file
1628 *
1629 * @param array Array with phash and phash_grouping keys for file
1630 * @param string File name
1631 * @param array Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
1632 * @param string File extension determining the type of media.
1633 * @param integer Modification time of file.
1634 * @param integer Creation time of file.
1635 * @param integer Size of file in bytes
1636 * @param integer Content HASH value.
1637 * @param array Standard content array (using only title and body for a file)
1638 * @return void
1639 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {
	// Replaces the index records of an external file: index_phash,
	// index_fulltext and - in debug mode - index_debug.

		// Find item Type: the parser may map the extension to another item
		// type; otherwise the extension itself is stored.
	$storeItemType = $ext;
	if (isset($this->external_parsers[$ext]->ext2itemtype_map[$ext]) && $this->external_parsers[$ext]->ext2itemtype_map[$ext]) {
		$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	}

		// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

		// Split filename. Local file names have no "scheme" part (and parse_url()
		// may return FALSE), so the key is tested with empty() below.
	$fileParts = parse_url($file);
	$isExternalUrl = !empty($fileParts['scheme']);

		// Fall back to the file name when the document has no usable title.
	$hasTitle = isset($contentParts['title']) && trim($contentParts['title']);

		// Setting new
	$fields = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => $hasTitle ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $isExternalUrl ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

		// PROCESSING index_fulltext
	$fields = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts)
	);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

		// PROCESSING index_debug
	if ($this->indexerConfig['debugMode']) {
		$fields = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize(array(
				'cHashParams' => $subinfo,
				'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
				'logs' => $this->internal_log,
				'lexer' => $this->lexerObj->debugString,
			))
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
	}
}
1699
/**
 * Registers a gr_list entry for an indexed file unless a suitable one exists.
 *
 * An index_grlist row for the phash whose hash_gr_list matches either the default
 * group list or the currently configured group list makes a new entry unnecessary.
 *
 * @param integer phash value of file
 * @return void
 */
function submitFile_grlist($hash) {
    $defaultListHash = $this->md5inthash($this->defaultGrList);
    $currentListHash = $this->md5inthash($this->conf['gr_list']);

    $where = 'phash=' . intval($hash) .
        ' AND (hash_gr_list=' . $defaultListHash .
        ' OR hash_gr_list=' . $currentListHash . ')';

    if ($GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where)) {
        return;
    }

    // For files the same value is passed as both arguments of submit_grlist().
    $this->submit_grlist($hash, $hash);
}
1719
/**
 * Stores a section record for a file unless index_section already holds a row
 * for that phash on the configured page id ($this->conf['id']).
 *
 * @param integer phash value of file
 * @return void
 */
function submitFile_section($hash) {
    $where = 'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id']);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

    if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
        return;
    }

    // Second argument is the phash held in $this->hash, not the file's own phash.
    $this->submit_section($hash, $this->hash['phash']);
}
1733
/**
 * Removes the index records of an indexed file, identified by $phash, from
 * index_phash, index_grlist, index_fulltext and index_debug.
 *
 * @param integer phash value to flush
 * @return void
 */
function removeOldIndexedFiles($phash) {
    $where = 'phash=' . intval($phash);

    foreach (array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug') as $table) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where);
    }
}
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762 /********************************
1763 *
1764 * SQL Helper functions
1765 *
1766 *******************************/
1767
/**
 * Compares mtime / tstamp of the currently indexed page or file (looked up by phash)
 * against the configured age limits and decides whether indexing is needed.
 *
 * Result codes (negative = no indexing, positive = do indexing, see $this->reasons):
 *   -2  Min age was NOT exceeded, so indexing cannot occur.
 *   -1  mtime matched, no need to reindex. The stored tstamp is refreshed unless a max age is set.
 *    1  Max age exceeded, must be indexed again.
 *    2  Stored mtime differs from the mtime given for the current content.
 *    3  No mtime was given, so indexing is done.
 *    4  No index_phash row found for this phash.
 *
 * @param integer mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param integer "phash" used to select any already indexed page to see what its mtime is.
 * @return integer One of the result codes listed above.
 */
function checkMtimeTstamp($mtime,$phash) {
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash=' . intval($phash));
    $indexed = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

    // Never indexed before.
    if (!$indexed) {
        return 4;
    }

    // A max age is configured and the stored timestamp is older than that.
    if ($this->tstamp_maxAge && ($indexed['tstamp'] + $this->tstamp_maxAge) < $GLOBALS['EXEC_TIME']) {
        return 1;
    }

    // A min age is configured and has not passed yet.
    if ($this->tstamp_minAge && !(($indexed['tstamp'] + $this->tstamp_minAge) < $GLOBALS['EXEC_TIME'])) {
        return -2;
    }

    // Without an mtime there is nothing to compare against.
    if (!$mtime) {
        return 3;
    }

    // Loose comparison on purpose: the database value arrives as a string.
    if ($indexed['item_mtime'] != $mtime) {
        return 2;
    }

    // Unchanged document: with a max age the timestamp is left alone so that the age keeps running.
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexed['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
    }
    return -1;
}
1806
/**
 * Looks in index_phash for a row with the current phash_grouping and content hash.
 *
 * With this lookup a page is only indexed if its content differs from what is
 * already stored under the same "phash_grouping".
 *
 * @return mixed Integer 1 if no such row exists (the page needs to be indexed), otherwise the found row as an array with the key "phash" (the phash record to which the grlist record should be related).
 */
function checkContentHash() {
    $where = 'A.phash_grouping=' . intval($this->hash['phash_grouping']) .
        ' AND A.contentHash=' . intval($this->content_md5h);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

    $existing = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
    return $existing ? $existing : 1;
}
1820
/**
 * Content hash check for external documents: looks in index_phash for a row
 * with the given phash_grouping and content hash.
 *
 * @param integer phash value to check (phash_grouping)
 * @param integer Content hash to check
 * @return integer 1 if no matching row exists (the document needs to be indexed), 0 if one was found.
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
    $where = 'A.phash_grouping=' . intval($hashGr) . ' AND A.contentHash=' . intval($content_md5h);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);

    return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
}
1836
/**
 * Counts the index_grlist rows stored for the given phash_x value (the "real" phash
 * of the current content, not the linked-to phash of the common search result page).
 *
 * @param integer Phash integer to test.
 * @return integer Result of exec_SELECTcountRows(): the number of matching rows; zero means no grlist record is set.
 */
function is_grlist_set($phash_x) {
    $where = 'phash_x=' . intval($phash_x);

    return $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', $where);
}
1850
/**
 * Writes a grlist entry for the current group list unless index_grlist already
 * has one for this phash.
 *
 * @param integer phash of the search result that should be found
 * @param integer The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; this is the whole reason for the grlist table.
 * @return void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x) {
    $where = 'phash=' . intval($phash) . ' AND hash_gr_list=' . $this->md5inthash($this->conf['gr_list']);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

    if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage("Inserted gr_list '" . $this->conf['gr_list'] . "' for phash '" . $phash . "'", 1);
}
1866
/**
 * Sets tstamp of an index_phash row to the current execution time
 * ($GLOBALS['EXEC_TIME']) and optionally stores a new item_mtime.
 *
 * @param integer phash value
 * @param integer If set, update the mtime field to this value.
 * @return void
 */
function updateTstamp($phash,$mtime=0) {
    $changes = $mtime
        ? array('tstamp' => $GLOBALS['EXEC_TIME'], 'item_mtime' => intval($mtime))
        : array('tstamp' => $GLOBALS['EXEC_TIME']);

    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), $changes);
}
1882
/**
 * Writes the configured freeIndexSetId ($this->conf['freeIndexSetId']) into the
 * index_phash row of the given phash.
 *
 * @param integer phash value
 * @return void
 */
function updateSetId($phash) {
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_phash',
        'phash=' . intval($phash),
        array('freeIndexSetId' => intval($this->conf['freeIndexSetId']))
    );
}
1896
/**
 * Stores the parsetime value in the index_phash row of the given phash.
 *
 * @param integer phash value.
 * @param integer Parsetime value to set.
 * @return void
 */
function updateParsetime($phash,$parsetime) {
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_phash',
        'phash=' . intval($phash),
        array('parsetime' => intval($parsetime))
    );
}
1911
/**
 * Rewrites the rootline fields of all index_section rows that belong to the
 * configured page id ($this->conf['id']).
 *
 * @return void
 */
function updateRootline() {
    // Filled by reference with rl0, rl1, rl2 and any configured extra fields.
    $rootlineFields = array();
    $this->getRootLineFields($rootlineFields);

    $where = 'page_id=' . intval($this->conf['id']);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootlineFields);
}
1924
/**
 * Adds the root-line field values to a field array.
 * rl0, rl1 and rl2 are always set from $this->conf['rootline_uids']. Further
 * fields can be registered as fieldName => rootline level pairs in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * @param array Field array, passed by reference
 * @return void
 */
function getRootLineFields(&$fieldArr) {
    $fieldArr['rl0'] = intval($this->conf['rootline_uids'][0]);
    $fieldArr['rl1'] = intval($this->conf['rootline_uids'][1]);
    $fieldArr['rl2'] = intval($this->conf['rootline_uids'][2]);

    $extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
    if (!is_array($extraFields)) {
        return;
    }

    foreach ($extraFields as $fieldName => $rootLineLevel) {
        $fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
    }
}
1944
/**
 * Removes indexed pages that share the current phash_grouping and content hash
 * but were stored under a group list other than the default one (user logins).
 * Marked by the original author as NOT USED anywhere inside this class.
 *
 * @return void
 */
function removeLoginpagesWithContentHash() {
    $where = "\nA.phash=B.phash\nAND A.phash_grouping=" . intval($this->hash['phash_grouping']) .
        "\nAND B.hash_gr_list!=" . $this->md5inthash($this->defaultGrList) .
        "\nAND A.contentHash=" . intval($this->content_md5h);

    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

    while ($duplicate = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        $this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='" . $duplicate['phash'] . "' are now removed.", 1);
        $this->removeOldIndexedPages($duplicate['phash']);
    }
}
1962
/**
 * Loads class.tx_crawler_lib.php from the "crawler" extension.
 *
 * @return void
 */
function includeCrawlerClass() {
    // Not referenced here, but a file required inside a function runs in that function's
    // variable scope; presumably the crawler file reads $TYPO3_CONF_VARS - confirm before removing.
    global $TYPO3_CONF_VARS;

    require_once t3lib_extMgm::extPath('crawler') . 'class.tx_crawler_lib.php';
}
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983 /********************************
1984 *
1985 * SQL; Submitting words
1986 *
1987 *******************************/
1988
/**
 * Adds words to index_words that are not stored there yet.
 *
 * The word ids of all entries are looked up first. Only if fewer rows come back
 * than there are words, the already known basewords are dropped from the list
 * and the remaining words are inserted.
 *
 * @param array Word List array (where each word has information about position etc). Keys are the words, each value holds at least "hash" and "metaphone".
 * @return void
 */
function checkWordList($wl) {
    // Cast the ids so the IN() list is built from integers only, like the other queries in this class.
    $wordIds = array();
    foreach ($wl as $wordData) {
        $wordIds[] = intval($wordData['hash']);
    }
    if (!count($wordIds)) {
        return;
    }

    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN (' . implode(',', $wordIds) . ')');

    // Ask for the row count once; it is needed for both the comparison and the log message.
    $knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);
    if ($knownCount == count($wl)) {
        return;
    }

    $this->log_setTSlogMessage('Inserting words: ' . (count($wl) - $knownCount), 1);

    // Drop every word that is already stored.
    while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        unset($wl[$row['baseword']]);
    }

    foreach ($wl as $word => $wordData) {
        $insertFields = array(
            'wid' => $wordData['hash'],
            'baseword' => $word,
            'metaphone' => $wordData['metaphone']
        );
        // A duplicate-key error will occur here if a word was NOT removed by the unset() above. As long as the words in $wl are not longer than 60 chars (the baseword varchar is 60 characters) this is not a problem.
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
    }
}
2022
/**
 * Stores the RELATIONS between words and a phash in index_rel.
 * Existing relations of the phash are deleted first, then one row per word is inserted.
 *
 * @param array Word list array
 * @param integer phash value
 * @return void
 */
function submitWords($wl,$phash) {
    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . intval($phash));

    foreach ($wl as $wordData) {
        // Share of this word in the total word count, mapped by freqMap().
        $relativeFrequency = $wordData['count'] / $this->wordcount;

        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', array(
            'phash' => $phash,
            'wid' => $wordData['hash'],
            'count' => $wordData['count'],
            'first' => $wordData['first'],
            'freq' => $this->freqMap($relativeFrequency),
            'flags' => ($wordData['cmp'] & $this->flagBitMask)
        ));
    }
}
2046
/**
 * Maps a frequency between a real number and the range [0;$this->freqRange], and back.
 *
 * Values below 1 are multiplied by $this->freqMax*100*$this->freqRange and capped
 * at $this->freqRange. Values of 1 and above are divided by the same factor.
 *
 * @param double Frequency
 * @return mixed Mapped frequency. No rounding is done here, so the value may be a float.
 */
function freqMap($freq) {
    $scale = $this->freqMax * 100 * $this->freqRange;

    if ($freq < 1) {
        $scaled = $freq * $scale;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }

    return $freq / $scale;
}
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076 /********************************
2077 *
2078 * Hashing
2079 *
2080 *******************************/
2081
/**
 * Calculates the search hashes of a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" as the combination of id, type, language,
 * mountpoint and cHash parameters. "phash" additionally includes gr_list, so it
 * subdivides that page by login-dependent composition.
 *
 * @return void
 */
function setT3Hashes() {
    $groupingParts = array(
        'id' => intval($this->conf['id']),
        'type' => intval($this->conf['type']),
        'sys_lang' => intval($this->conf['sys_language_uid']),
        'MP' => strval($this->conf['MP']),
        'cHash' => $this->cHashParams
    );
    $this->hash['phash_grouping'] = $this->md5inthash(serialize($groupingParts));

    // gr_list has to be the last key so the serialized string keeps its order.
    $phashParts = $groupingParts + array('gr_list' => strval($this->conf['gr_list']));
    $this->hash['phash'] = $this->md5inthash(serialize($phashParts));
}
2105
/**
 * Calculates the search hashes of an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the file
 * name together with $subinfo.
 *
 * @param string File name / path which identifies it on the server
 * @param array Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
function setExtHashes($file,$subinfo=array()) {
    $groupingParts = array('file' => $file);
    $phashParts = array('file' => $file, 'subinfo' => $subinfo);

    return array(
        'phash_grouping' => $this->md5inthash(serialize($groupingParts)),
        'phash' => $this->md5inthash(serialize($phashParts))
    );
}
2129
/**
 * md5 integer hash
 * Only the first 7 hex digits of the md5 are used (28 bit) instead of 8, so the
 * result stays below 32 bit and does not interfere with UNSIGNED integers or with
 * PHP versions whose hexdec() output varies.
 *
 * @param string String to hash
 * @return integer Integer interpretation of the md5 hash of input string.
 */
function md5inthash($str) {
    $hexPrefix = substr(md5($str), 0, 7);

    return hexdec($hexPrefix);
}
2140
/**
 * Calculates the cHash value of input GET array (for constructing cHash values if needed)
 *
 * @param array Array of GET parameters to encode
 * @return mixed The value returned by t3lib_div::shortMD5() for the serialized cHash parameters.
 * @deprecated since TYPO3 4.3, this function will be removed in TYPO3 4.6, use directly t3lib_div::calculateCHash()
 */
function makeCHash($paramArray) {
    t3lib_div::logDeprecatedFunction();

    $queryString = t3lib_div::implodeArrayForUrl('', $paramArray);
    $cHashParams = t3lib_div::cHashParams($queryString);

    return t3lib_div::shortMD5(serialize($cHashParams));
}
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169 /*********************************
2170 *
2171 * Internal logging functions
2172 *
2173 *********************************/
2174
/**
 * Push function wrapper for TT logging.
 * Does nothing when $GLOBALS['TT'] is not an object.
 *
 * @param string Title to set
 * @param string Key (?)
 * @return void
 */
function log_push($msg,$key) {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->push($msg, $key);
}
2185
/**
 * Pull function wrapper for TT logging.
 * Does nothing when $GLOBALS['TT'] is not an object.
 *
 * @return void
 */
function log_pull() {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->pull();
}
2194
/**
 * Set log message function wrapper for TT logging.
 * The message is passed on to $GLOBALS['TT'] if that is an object and is always
 * appended to $this->internal_log.
 *
 * @param string Message to set
 * @param integer Error number
 * @return void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
    if (is_object($GLOBALS['TT'])) {
        $GLOBALS['TT']->setTSlogMessage($msg, $errorNum);
    }

    $this->internal_log[] = $msg;
}
2206
2207
2208
2209
2210
2211
2212
2213
2214 /**************************
2215 *
2216 * tslib_fe hooks:
2217 *
2218 **************************/
2219
/**
 * Frontend hook: If the page is not being re-generated this is our chance to force it to be (because re-generation of the page is required in order to have the indexer called!)
 * Only forwards to tx_indexedsearch_tslib_fe_hook::headerNoCache().
 *
 * @param array Parameters from frontend
 * @param object TSFE object (reference under PHP5)
 * @return void
 * @deprecated since TYPO3 4.3, this function will be removed in TYPO3 4.6, the method was extracted to hooks/class.tx_indexedsearch_tslib_fe_hook.php
 */
function fe_headerNoCache(&$params, $ref) {
    t3lib_div::logDeprecatedFunction();

    require_once(t3lib_extMgm::extPath('indexed_search') . 'hooks/class.tx_indexedsearch_tslib_fe_hook.php');

    $hook = t3lib_div::makeInstance('tx_indexedsearch_tslib_fe_hook');
    $hook->headerNoCache($params, $ref);
}
2234
/**
 * Makes sure that keywords are space-separated. This is important for their
 * proper displaying as a part of fulltext index.
 * The comma-separated list is split, re-joined with ", " and wrapped in single spaces.
 *
 * @param string $keywordList
 * @return string
 * @see http://bugs.typo3.org/view.php?id=1436
 */
protected function addSpacesToKeywordList($keywordList) {
    $spacedList = implode(', ', t3lib_div::trimExplode(',', $keywordList));

    return ' ' . $spacedList . ' ';
}
2247 }
2248
2249
// XCLASS inclusion: loads an extending class file if one is registered for this script.
// !empty() keeps the truthiness test but raises no notice when no XCLASS entry exists.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])) {
    include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
}
2253 ?>