[TASK] Unify backend: header/section
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * This class is a search indexer for TYPO3
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
32 */
33 /**
34 * Indexing class for TYPO3 frontend
35 *
36 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
37 * @package TYPO3
38 * @subpackage tx_indexedsearch
39 */
40 class tx_indexedsearch_indexer {
41
42 // Messages:
43 var $reasons = array(
44 -1 => 'mtime matched the document, so no changes detected and no content updated',
45 -2 => 'The minimum age was not exceeded',
46 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
47 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
48 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
49 4 => 'Page has never been indexed (is not represented in the index_phash table).'
50 );
51
52 // HTML code blocks to exclude from indexing:
53 var $excludeSections = 'script,style';
54
55 // Supported Extensions for external files:
56 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods.
57
58 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
59 var $defaultGrList = '0,-1';
60
61 // Min/Max times:
62 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
63 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
64 var $maxExternalFiles = 0; // Max number of external files to index.
65
66 var $forceIndexing = FALSE; // If TRUE, indexing is forced despite of hashes etc.
67 var $crawlerActive = FALSE; // Set when crawler is detected (internal)
68
69 // INTERNALS:
70 var $defaultContentArray=array(
71 'title' => '',
72 'description' => '',
73 'keywords' => '',
74 'body' => '',
75 );
76 var $wordcount = 0;
77 var $externalFileCounter = 0;
78
79 var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning)
80 var $indexerConfig = array(); // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
81 var $hash = array(); // Hash array, contains phash and phash_grouping
82 var $file_phash_arr = array(); // Hash array for files
83 var $contentParts = array(); // Content of TYPO3 page
84 var $content_md5h = '';
85 var $internal_log = array(); // Internal log
86 var $indexExternalUrl_content = '';
87
88 var $cHashParams = array(); // cHashparams array
89
90 var $freqRange = 32000;
91 var $freqMax = 0.1;
92
93 var $enableMetaphoneSearch = FALSE;
94 var $storeMetaphoneInfoAsWords;
95 var $metaphoneContent = '';
96
97 // Objects:
98 /**
99 * Charset class object
100 *
101 * @var t3lib_cs
102 */
103 var $csObj;
104
105 /**
106 * Metaphone object, if any
107 *
108 * @var user_DoubleMetaPhone
109 */
110 var $metaphoneObj;
111
112 /**
113 * Lexer object for word splitting
114 *
115 * @var tx_indexedsearch_lexer
116 */
117 var $lexerObj;
118
119 var $flagBitMask;
120
/**
 * Frontend hook: decides whether the page currently being rendered should be indexed
 * and, if so, copies the relevant state from the TSFE object into $this->conf,
 * initializes the indexer and indexes the page content.
 *
 * Indexing is skipped (with a TS log message) when frontend indexing is disabled and no
 * crawler run forces it, when the page has "no_search" set, when the page is rendered
 * with no_cache, or when sys_language_uid differs from sys_language_content.
 *
 * @param object Parent Object (frontend TSFE object), passed by reference
 * @return void
 */
function hook_indexContent(&$pObj) {

		// Indexer configuration from Extension Manager interface.
		// unserialize() yields FALSE when nothing (valid) is stored, so normalize to an array:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}

		// Crawler activation:
		// Requirements are that the crawler is loaded, a crawler session is running and
		// re-indexing was requested as processing instruction. The is_array() check keeps
		// in_array() from being called on a missing/non-array instruction list.
	if (t3lib_extMgm::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& is_array($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {

			// Setting simple log message:
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

			// Crawler active flag + force indexing despite timestamps etc.
		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

		// Nothing to do unless indexing is enabled for this page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

	if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid, $pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {

			// Root line uids (same keys as the TSFE rootline):
		$rootlineUids = array();
		if (is_array($pObj->config['rootLine'])) {
			foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
				$rootlineUids[$rlkey] = $rldat['uid'];
			}
		}

			// Setting up internal configuration from the TSFE object:
		$this->conf = array(
				// Information about page for which the indexing takes place
			'id' => $pObj->id,									// Page id
			'type' => $pObj->type,								// Page type
			'sys_language_uid' => $pObj->sys_language_uid,		// sys_language UID of the language of the indexing.
			'MP' => $pObj->MP,									// MP variable, if any (Mount Points)
			'gr_list' => $pObj->gr_list,						// Group list

			'cHash' => $pObj->cHash,							// cHash string for additional parameters
			'cHash_array' => $pObj->cHash_array,				// Array of the additional parameters

			'crdate' => $pObj->page['crdate'],					// The creation date of the TYPO3 page
			'page_cache_reg1' => $pObj->page_cache_reg1,		// reg1 of the caching table. Not known what practical use this has.

			'rootline_uids' => $rootlineUids,

				// Content of page:
			'content' => $pObj->content,						// Content string (HTML of TYPO3 page)
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),	// Alternative title for indexing
			'metaCharset' => $pObj->metaCharset,				// Character set of content (will be converted to utf-8 during indexing)
			'mtime' => $pObj->register['SYS_LASTCHANGED'],		// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

				// Configuration of behavior:
			'index_externals' => $pObj->config['config']['index_externals'],	// Whether to index external documents like PDF, DOC etc. (if possible)
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],		// Length of description text (max 250, default 200)
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,

				// Set to zero:
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0,
		);

			// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
203
204
205
206
207
208
209
210
211 /****************************
212 *
213 * Backend API
214 *
215 ****************************/
216
/**
 * Backend API: sets up $this->conf for the "combined ID" (phash) of the page being
 * indexed (or to which external media is attached) and then calls init().
 *
 * @param integer The page uid, &id=
 * @param integer The page type, &type=
 * @param integer sys_language uid, typically &L=
 * @param string The MP variable (Mount Points), &MP=
 * @param array Rootline array of only UIDs.
 * @param array Array of GET variables to register with this indexing
 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
 * @return void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

		// cHash string for the additional parameters; only calculated on request:
	$cHash = '';
	if ($createCHash) {
		$cHash = t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array));
	}

		// Internal configuration, replacing whatever was configured before:
	$this->conf = array(
			// Information about page for which the indexing takes place
		'id' => $id,								// Page id (integer)
		'type' => $type,							// Page type (integer)
		'sys_language_uid' => $sys_language_uid,	// sys_language UID of the language of the indexing (integer)
		'MP' => $MP,								// MP variable, if any (Mount Points) (string)
		'gr_list' => '0,-1',						// Group list (hardcoded for now...)

			// cHash values:
		'cHash' => $cHash,
		'cHash_array' => $cHash_array,				// Array of the additional parameters

			// Defaults:
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',

			// Root line uids
		'rootline_uids' => $uidRL,

			// Configuration of behavior:
		'index_externals' => 1,						// Whether to index external documents like PDF, DOC etc. (if possible)
		'index_descrLgd' => 200,					// Length of description text (max 250, default 200)
		'index_metatags' => TRUE,					// Whether to index document keywords and description (if present)
	);

		// Initialize the indexer with this configuration:
	$this->init();
}
261
/**
 * Stores the free-index uid and its set id in the indexer configuration.
 * Can be called right after backend_initIndexer().
 *
 * @param integer Free index UID
 * @param integer Set id - an integer identifying the "set" of indexing operations.
 * @return void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
		// Both values are written to $this->conf as given, without validation.
	$this->conf['freeIndexUid'] = $freeIndexUid;
	$this->conf['freeIndexSetId'] = $freeIndexSetId;
}
273
/**
 * Backend API: indexes record content as if it were the content of a TYPO3 page.
 * The given fields are HTML-escaped and wrapped in a minimal HTML document which is
 * then passed through the regular page indexing (indexTypo3PageContent()).
 *
 * @param string Title equivalent
 * @param string Keywords equivalent
 * @param string Description equivalent
 * @param string The main content to index
 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param integer Last modification time, in seconds
 * @param integer The creation date of the content, in seconds
 * @param integer The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

		// Escape everything that goes into the fake document:
	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedBody = htmlspecialchars($content);

		// Timestamps and origin of the content:
	$this->conf['mtime'] = $mtime;			// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;		// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

		// Fake HTML document for parsing (content string as for a TYPO3 page):
	$this->conf['content'] = '
<html>
<head>
<title>'.$escapedTitle.'</title>
<meta name="keywords" content="'.$escapedKeywords.'" />
<meta name="description" content="'.$escapedDescription.'" />
</head>
<body>
'.$escapedBody.'
</body>
</html>';

		// Charset of the content (will be converted to utf-8 during indexing); no alternative title:
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

		// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}
314
315
316
317
318
319
320
321
322
323
324
325
326
327 /********************************
328 *
329 * Initialization
330 *
331 *******************************/
332
/**
 * Initializes the indexer: cHash parameters, page hashes, settings from the extension
 * configuration, external parsers, lexer, metaphone hook and charset object.
 * $this->conf MUST be set with proper values prior to this call!!!
 *
 * @return void
 */
function init() {
		// Hook configuration of this extension ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']):
	$hookConf = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'];

		// cHash parameters:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
			// Add cHash so that URL's come out right...
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
			// encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
		unset($this->cHashParams['encryptionKey']);
	}

		// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
	$this->setT3Hashes();

		// Indexer configuration from Extension Manager interface (minAge/maxAge are multiplied by 3600 here):
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['flagBitMask'],0,255);

		// Workaround: If the extension configuration was not updated yet, the value is not existing - then metaphone search defaults to enabled
	$this->enableMetaphoneSearch = TRUE;
	if (isset($this->indexerConfig['enableMetaphoneSearch'])) {
		$this->enableMetaphoneSearch = (bool)$this->indexerConfig['enableMetaphoneSearch'];
	}

		// Metaphone info is stored as words only when the index_words table is not used:
	$this->storeMetaphoneInfoAsWords = !tx_indexedsearch_util::isTableUsed('index_words') && $this->enableMetaphoneSearch;

		// Initialize external document parsers:
		// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Initialize lexer (class that deconstructs the text into words):
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	if ($hookConf['lexer']) {
		$lexerObjRef = $hookConf['lexer'];
	}
	$this->lexerObj = t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

		// Initialize metaphone hook:
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
		// Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
	if ($this->enableMetaphoneSearch && $hookConf['metaphone']) {
		$this->metaphoneObj = t3lib_div::getUserObj($hookConf['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}

		// Init charset class:
	$this->csObj = t3lib_div::makeInstance('t3lib_cs');
}
388
/**
 * Instantiates the external document parsers registered in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (file extension => object reference)
 * and keeps those whose initParser() call succeeds in $this->external_parsers.
 *
 * @return void
 * @access private
 * @see init()
 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	if (!is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}

	foreach ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$parser = t3lib_div::getUserObj($_objRef);

			// A reference that does not resolve to an object cannot be used as parser;
			// skip it instead of dereferencing a non-object below.
		if (!is_object($parser)) {
			unset($this->external_parsers[$extension]);
			continue;
		}

			// Register first, so the parser is already visible through pObj while it initializes:
		$this->external_parsers[$extension] = $parser;
		$parser->pObj = $this;

			// Init parser and if it returns FALSE, unset its entry again:
		if (!$parser->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426 /********************************
427 *
428 * Indexing; TYPO3 pages (HTML content)
429 *
430 *******************************/
431
/**
 * Starts indexing of the TYPO3 page configured in $this->conf.
 *
 * The page is (re-)indexed when the mtime/timestamp check asks for it, when the current
 * gr_list is not yet registered for the phash, or when indexing is forced. If the content
 * hash turns out to be unchanged, only timestamp, set id, gr_list and rootline are updated.
 *
 * @return void
 */
function indexTypo3PageContent() {

	$mtimeCheck = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$grListIsSet = $this->is_grlist_set($this->hash['phash']);

		// No reason to index at all:
	if (!($mtimeCheck > 0) && $grListIsSet && !$this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$mtimeCheck]);
		return;
	}

		// Setting message:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($mtimeCheck > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$mtimeCheck],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

		// Divide into title,keywords,description and body:
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

		// Hash over what is to be the actual page content (title, keywords, description and body):
	$this->content_md5h = tx_indexedsearch_util::md5inthash(implode('', $this->contentParts));

		// checkContentHash() is used to detect an already indexed page with the very same content hash;
		// an array result is treated as "such a row exists" and its phash is used below.
	$checkCHash = $this->checkContentHash();

		// Same content already indexed and max-age not exceeded ($mtimeCheck !== 1): only refresh meta data.
	if (is_array($checkCHash) && $mtimeCheck !== 1) {
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

		// Full indexing from here on; parse time is measured from this point:
	$Pstart = t3lib_div::milliseconds();

	$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
	$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
	$this->log_pull();

		// Splitting words
	$this->log_push('Extract words from content','');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

		// Analyse the indexed words.
	$this->log_push('Analyse the extracted words','');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

		// Submitting page (phash) record
	$this->log_push('Submitting page','');
	$this->submitPage();
	$this->log_pull();

		// Check words and submit to word list if not there
	$this->log_push('Check word list and submit words','');
	if (tx_indexedsearch_util::isTableUsed('index_words')) {
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr, $this->hash['phash']);
	}
	$this->log_pull();

		// Set parsetime
	$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

		// Checking external files if configured for.
	$this->log_push('Checking external files','');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
518
/**
 * Splits HTML content into title, keywords, description and body text.
 *
 * Everything from the first "<body" on is the body; what precedes it is the head part,
 * from which the title and (if $this->conf['index_metatags'] is set) the keywords and
 * description meta tags are read. Of a title containing ":" only the part after the first
 * colon is used. The body is reduced by the TYPO3SEARCH markers and the excluded sections,
 * then stripped of tags.
 *
 * @param string HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
function splitHTMLContent($content) {

		// Divide head from body:
	$parts = $this->defaultContentArray;
	$parts['body'] = stristr($content, '<body');
	$head = substr($content, 0, -strlen($parts['body']));

		// Title; explode() with limit 2 gives one or two elements and the last one is wanted:
	$this->embracingTags($head, 'TITLE', $parts['title'], $unusedRest, $unusedAttributes);
	$titleSegments = explode(':', $parts['title'], 2);
	$parts['title'] = trim(end($titleSegments));

		// Keywords and description metatags:
	if ($this->conf['index_metatags']) {
			// Collect the attribute strings of all meta tags; $head is consumed tag by tag.
			// The final, failing call leaves a NULL entry behind which ends the loop below.
		$metaTags = array();
		$metaCount = 0;
		while ($this->embracingTags($head, 'meta', $unusedContent, $head, $metaTags[$metaCount])) {
			$metaCount++;
		}

		foreach ($metaTags as $rawAttributes) {
			if ($rawAttributes === NULL) {
				break;
			}
			$attributes = t3lib_div::get_tag_attributes($rawAttributes);
			if (stripos($attributes['name'], 'keywords') !== FALSE) {
				$parts['keywords'] .= ',' . $this->addSpacesToKeywordList($attributes['content']);
			}
			if (stripos($attributes['name'], 'description') !== FALSE) {
				$parts['description'] .= ',' . $attributes['content'];
			}
		}
	}

		// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($parts['body']);

		// Get rid of unwanted sections (ie. scripting and style stuff) in body, one occurrence at a time:
	foreach (explode(',', $this->excludeSections) as $excludedTag) {
		do {
			$sectionRemoved = $this->embracingTags($parts['body'], $excludedTag, $unusedContent, $parts['body'], $unusedAttributes);
		} while ($sectionRemoved);
	}

		// Remove tags, but put a space before each one first so words are not concatenated:
	$parts['body'] = trim(strip_tags(str_replace('<', ' <', $parts['body'])));

	$parts['keywords'] = trim($parts['keywords']);
	$parts['description'] = trim($parts['description']);

	return $parts;
}
576
/**
 * Extracts the charset value from the HTML meta tag with http-equiv="CONTENT-TYPE"
 * (matched case-insensitively).
 *
 * @param string HTML content
 * @return string The charset value if found; otherwise nothing (NULL) is returned.
 */
function getHTMLcharset($content) {
		// Locate the content-type meta tag first ...
	$metaTagFound = preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $metaTag);
	if (!$metaTagFound) {
		return NULL;
	}

		// ... then look for "charset=..." inside that tag only:
	$charsetFound = preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $metaTag[0], $charsetMatch);
	if (!$charsetFound) {
		return NULL;
	}

	return $charsetMatch[1];
}
590
/**
 * Converts a HTML document to utf-8: first the character set (unless it already is
 * utf-8), then the entities.
 *
 * @param string HTML content, any charset
 * @param string Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
function convertHTMLToUtf8($content,$charset='') {

		// Find charset; fall back to the one declared in the document:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

		// Convert charset, unless none is known or the content is utf-8 already:
	$needsConversion = $charset && $charset !== 'utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

		// Convert entities, assuming document is now UTF-8:
	return $this->csObj->entities_to_utf8($content, TRUE);
}
613
/**
 * Finds the first occurrence of a tag (case-insensitive) and returns, through the
 * reference parameters, the embraced content and the original string with the tag
 * removed. Useful for finding the <title> of a document or removing <script> sections.
 *
 * If the tag has no ending tag, the content is blank and only what follows the opening
 * tag is returned as "string after" (the text before the tag is dropped in that case).
 * If the opening tag is never closed with ">", content and "string after" are both empty
 * strings. When the tag is not found, the reference parameters are left untouched.
 *
 * @param string String to search in
 * @param string Tag name, eg. "script"
 * @param string Passed by reference: Content inside found tag
 * @param string Passed by reference: Content after found tag
 * @param string Passed by reference: Attributes of the found tag.
 * @return boolean Returns FALSE if tag was not found, otherwise TRUE.
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$startTag = '<'.$tagName;
	$endTag = '</'.$tagName.'>';

		// stristr used because we want a case-insensitive search for the tag.
	$fromStartTag = stristr($string, $startTag);
	if ($fromStartTag === FALSE) {
		return FALSE;
	}

		// Split "<attributes> > <rest>". Without a closing ">" there is no second segment,
		// so fall back to an empty rest instead of reading an undefined offset.
	$segments = explode('>', substr($fromStartTag, strlen($startTag)), 2);
	$paramList = $segments[0];
	$afterOpeningTag = isset($segments[1]) ? $segments[1] : '';

	$fromEndTag = stristr($afterOpeningTag, $endTag);
	if ($fromEndTag) {
			// Text before the tag = input minus everything from the start tag on:
		$stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
		$tagContent = substr($afterOpeningTag, 0, strlen($afterOpeningTag) - strlen($fromEndTag));
		$stringAfter = $stringBefore.substr($fromEndTag, strlen($endTag));
	} else {
			// If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
		$tagContent = '';
		$stringAfter = $afterOpeningTag;
	}

	return TRUE;
}
646
/**
 * Removes content that shouldn't be indexed according to the
 * <!--TYPO3SEARCH_begin--> / <!--TYPO3SEARCH_end--> markers.
 *
 * Content following a "begin" marker is kept. Content that is followed by an "end"
 * marker without a preceding "begin" marker is kept as well. If no marker is present
 * the body is left unchanged.
 *
 * @param string HTML Content, passed by reference
 * @return boolean Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
function typoSearchTags(&$body) {
	$chunks = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

		// A single chunk means no marker was found:
	if (count($chunks) <= 1) {
		return FALSE;
	}

	$body = '';
	foreach ($chunks as $chunk) {
			// A chunk starting at a marker looks like "begin-->content" or "end-->content":
		$pieces = explode('-->', $chunk, 2);
		switch (trim($pieces[0])) {
			case 'begin':
					// Keep what follows the begin marker; nothing is pending anymore.
				$body .= $pieces[1];
				$pending = '';
			break;
			case 'end':
					// Keep content that was not preceded by a begin marker.
				$body .= $pending;
			break;
			default:
					// Not a marker chunk: remember it in case an end marker follows.
				$pending = $chunk;
			break;
		}
	}

	return TRUE;
}
675
/**
 * Extracts links (hrefs) from HTML content and indexes the media they point to.
 *
 * Links with a scheme (and no local path) are handed to indexExternalUrl() if
 * $this->indexerConfig['indexExternalURLs'] is set. Links without query string that
 * resolve to an existing local file are either added to the crawler queue (if the
 * crawler is configured and loaded) or indexed directly via indexRegularDocument().
 *
 * @param string HTML content
 * @return void
 */
function extractLinks($content) {

		// Get links:
	$list = $this->extractHyperLinks($content);

		// Crawler object; stays NULL when files are to be indexed directly:
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

		// Traverse links:
	foreach ($list as $linkInfo) {

			// Decode entities:
			// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
		$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath'] ? $linkInfo['localPath'] : $linkInfo['href']);

			// Parse URL; parse_url() returns FALSE for seriously malformed URLs:
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}

			// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
			parse_str($qParts['query'], $getP);
			$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';

				// Parse again due to new linkSource!
			$qParts = parse_url($linkSource);
			if (!is_array($qParts)) {
				$qParts = array();
			}
		}

			// External URL (http or otherwise):
		if (!$linkInfo['localPath'] && !empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

			// Links with a query string are not treated as files:
		if (!empty($qParts['query'])) {
			continue;
		}

			// Resolve to a local file:
		$linkSource = urldecode($linkSource);
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
			// Error suppression kept from the original check.
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

			// Index local file, through the crawler queue if available:
		if (is_object($crawler)) {
			$params = array('document' => $linkSource);
			if ($linkInfo['localPath']) {
					// The URL under which the file is linked differs from its path:
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($linkInfo['localPath']) {
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
761
/**
 * Extracts all links to external documents from the HTML content string.
 * <a> tags without href and pure anchor links (href starting with "#") are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
function extractHyperLinks($html) {
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('a', $html);
	$hyperLinksData = array();

	foreach ($htmlParts as $index => $tagData) {
			// Only the odd-indexed parts are processed here (presumably the tags themselves - confirm against splitTags()):
		if (($index % 2) === 0) {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

			// substr() replaces the former curly-brace string offset ({0}), which is
			// deprecated as of PHP 7.4 and a parse error in PHP 8.
		$href = isset($tagAttributes[0]['href']) ? $tagAttributes[0]['href'] : '';
		if ($href && substr($href, 0, 1) != '#') {
			$hyperLinksData[] = array(
				'tag' => $tagData,
				'href' => $href,
				'localPath' => $this->createLocalPath($href)
			);
		}
	}

	return $hyperLinksData;
}
792
793 /**
794 * Extracts the "base href" from content string.
795 *
796 * @param string Content to analyze
797 * @return string The base href or an empty string if not found
798 */
public function extractBaseHref($html) {
	// Returns the href of the first <base> tag that carries a non-empty href.
	$baseHref = '';
	$parser = t3lib_div::makeInstance('t3lib_parseHtml');
	foreach ($parser->splitTags('base', $html) as $position => $fragment) {
		// Even positions are skipped; only odd ones are inspected as tags
		if (($position % 2) === 0) {
			continue;
		}
		$attributes = $parser->get_tag_attributes($fragment, TRUE);
		$tagName = $parser->getFirstTagName($fragment);
		if (strtolower($tagName) != 'base') {
			continue;
		}
		$baseHref = $attributes[0]['href'];
		if ($baseHref) {
			// First usable href wins
			break;
		}
	}

	return $baseHref;
}
818
819 /******************************************
820 *
821 * Indexing; external URL
822 *
823 ******************************************/
824
825 /**
826 * Index External URLs HTML content
827 *
828 * @param string URL, eg. "http://typo3.org/"
829 * @return void
830 * @see indexRegularDocument()
831 */
function indexExternalUrl($externalUrl) {
	// Fetches $externalUrl and, if the server announces it as text/html, writes
	// the body to a temporary file and indexes that file under the URL.
	// The raw body is also kept in $this->indexExternalUrl_content.

	// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);

	// HTTP header names are case-insensitive, so look the content type up
	// regardless of spelling. The last matching entry wins.
	$contentType = '';
	if (is_array($urlHeaders)) {
		foreach ($urlHeaders as $headerName => $headerValue) {
			if (strtolower(trim($headerName)) === 'content-type') {
				$contentType = (string)$headerValue;
			}
		}
	}
	if (!stristr($contentType, 'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

	// Create temporary file:
	$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
	if (!$tmpFile) {
		return;
	}
	t3lib_div::writeFile($tmpFile, $content);

	// Index that file. TRUE as second parameter forces indexing, since the
	// mtime of a freshly written temporary file says nothing about the URL.
	$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
	unlink($tmpFile);
}
857
858 /**
859 * Getting HTTP request headers of URL
860 *
861 * @param string The URL
 * @return mixed If no answer, returns NULL. Otherwise an array where the HTTP header names are the keys
864 */
function getUrlHeaders($url) {
	// Requests $url and returns the response header lines as an array
	// (header name => value). Returns NULL when nothing was received.

	// Try to get the headers only
	$content = t3lib_div::getUrl($url, 2);
	if (!strlen($content)) {
		return NULL;
	}

	// Compile headers:
	$retVal = array();
	foreach (t3lib_div::trimExplode(LF, $content, 1) as $line) {
		if (!strlen(trim($line))) {
			// Stop at the first empty line (= end of header)
			break;
		}

		// Lines without a colon (such as the HTTP status line) are kept as a
		// key with a NULL value instead of triggering an undefined offset.
		$parts = explode(':', $line, 2);
		$retVal[$parts[0]] = isset($parts[1]) ? trim($parts[1]) : NULL;
	}

	return $retVal;
}
883
884
885
886 /**
887 * Checks if the file is local
888 *
889 * @param $sourcePath
890 * @return string Absolute path to file if file is local, else empty string
891 */
protected function createLocalPath($sourcePath) {
	// Tries the resolver methods below in order and returns the first
	// non-empty path one of them produces.
	static $resolverMethods = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);

	$resolvedPath = '';
	foreach ($resolverMethods as $resolverMethod) {
		$resolvedPath = $this->$resolverMethod($sourcePath);
		if ($resolvedPath != '') {
			return $resolvedPath;
		}
	}

	// No resolver succeeded: hand back whatever the last one returned
	return $resolvedPath;
}
909
910 /**
911 * Attempts to create a local file path from T3VARs. This is useful for
912 * various download extensions that hide actual file name but still want the
913 * file to be indexed.
914 *
915 * @param string $sourcePath
916 * @return string
917 */
protected function createLocalPathFromT3vars($sourcePath) {
	// Looks the short MD5 of $sourcePath up in the file map registered in
	// $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}

	$lookupKey = t3lib_div::shortMD5($sourcePath);

	// Note: self::isAllowedLocalFile is deliberately not used here because
	// this method is allowed to index files outside of the web site (for
	// example, protected downloads)
	$isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);

	return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
932
933 /**
934 * Attempts to create a local file path by matching a current request URL.
935 *
936 * @param string $sourcePath
937 * @return string
938 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	// If $sourcePath starts with the site URL, the remainder is mapped below
	// PATH_site and returned when it is an allowed local file.
	$siteUrl = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);
	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
952
953 /**
954 * Attempts to create a local file path by matching absRefPrefix. This
955 * requires TSFE. If TSFE is missing, this function does nothing.
956 *
957 * @param string $sourcePath
958 * @return string
959 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	// Without a TSFE object there is no absRefPrefix to match against
	if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
		return '';
	}

	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);
	if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}

	// Strip the prefix and map the remainder below PATH_site
	$candidate = PATH_site . substr($sourcePath, $prefixLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
975
976 /**
977 * Attempts to create a local file path from the absolute URL without
978 * schema.
979 *
980 * @param string $sourcePath
981 * @return string
982 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	// Maps a path starting with "/" below PATH_site and returns it when it
	// is an allowed local file; returns an empty string otherwise.
	$localPath = '';

	// substr() replaces the removed curly-brace offset ($sourcePath{0}) and
	// does not raise a notice for an empty string.
	if (substr((string)$sourcePath, 0, 1) === '/') {
		$localPath = PATH_site . substr($sourcePath, 1);
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}

	return $localPath;
}
994
995 /**
996 * Attempts to create a local file path from the relative URL.
997 *
998 * @param string $sourcePath
999 * @return string
1000 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	// Only relative URLs are handled here
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}

	// Interpret the relative URL as a path below PATH_site
	$candidate = PATH_site . $sourcePath;

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1011
1012 /**
1013 * Checks if URL is relative.
1014 *
1015 * @param string $url
1016 * @return boolean
1017 */
static protected function isRelativeURL($url) {
	// A URL counts as relative when it has no scheme and its path does not
	// start with "/". URLs that parse_url() cannot parse have neither and
	// are therefore reported as relative, as before.

	// "@" kept from the original call to silence parse_url() warnings
	$urlParts = @parse_url($url);
	$hasParts = is_array($urlParts);
	$scheme = ($hasParts && isset($urlParts['scheme'])) ? (string)$urlParts['scheme'] : '';
	$path = ($hasParts && isset($urlParts['path'])) ? (string)$urlParts['path'] : '';

	// substr() replaces the removed curly-brace offset ($path{0}) and is safe
	// for an empty path.
	return ($scheme === '' && substr($path, 0, 1) !== '/');
}
1022
1023 /**
1024 * Checks if the path points to the file inside the web site
1025 *
1026 * @param string $filePath
1027 * @return boolean
1028 */
static protected function isAllowedLocalFile($filePath) {
	// Resolve back paths first so the prefix check below works on the
	// resolved path
	$resolvedPath = t3lib_div::resolveBackPath($filePath);

	$siteRootLength = strlen(PATH_site);
	$startsWithSiteRoot = (substr($resolvedPath, 0, $siteRootLength) == PATH_site);
	$existsAsFile = is_file($resolvedPath);

	// Allowed only if the path is below PATH_site AND is an existing file
	return $startsWithSiteRoot && $existsAsFile;
}
1035
1036 /******************************************
1037 *
1038 * Indexing; external files (PDF, DOC, etc)
1039 *
1040 ******************************************/
1041
1042 /**
1043 * Indexing a regular document given as $file (relative to PATH_site, local file)
1044 *
1045 * @param string Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
1046 * @param boolean If set, indexing is forced (despite content hashes, mtime etc).
1047 * @param string Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
1048 * @param string File extension for temporary file.
1049 * @return void
1050 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {
	// Indexes one external document. The document may be divided into several
	// parts by its parser (see fileContentParts()); every part gets its own
	// phash and is indexed separately. A section record is written for every
	// part, whether or not the part itself had to be (re-)indexed.

	// Init: an explicitly given extension wins over the one found in $file
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$fI = pathinfo($file);
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

	// Create abs-path:
	if ($contentTmpFile) {
		$absFile = $contentTmpFile;
	} else {
		// Relative paths get PATH_site prepended, absolute ones pass through
		$absFile = t3lib_div::isAbsPath($file)
			? $file
			: t3lib_div::getFileAbsFileName(PATH_site.$file);
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	}

	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}

	// empty() avoids an undefined-index notice for unknown extensions
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

	// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();

		// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$subinfo = array('key' => $cPKey);
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

			// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

				// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
					// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
					// Separator first: the reversed implode() argument order is no longer accepted by PHP 8.
					$content_md5h = tx_indexedsearch_util::md5inthash(implode('', $contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

						// Increment counter:
						$this->externalFileCounter++;

						// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

						// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

						// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						// The original creation time of a file cannot be determined, so the modification time is used
						$ctime = filemtime($absFile);
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

						// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						if (tx_indexedsearch_util::isTableUsed('index_words')) {
							$this->checkWordList($indexArr);
							$this->submitWords($indexArr, $phash_arr['phash']);
						}
						$this->log_pull();

						// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						// Update the timestamp
						$this->updateTstamp($phash_arr['phash'],$mtime);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

		// Checking and setting sections:
		// (The call to submitFile_grlist() was already disabled at this place.)
		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
1149
1150 /**
1151 * Reads the content of an external file being indexed.
1152 * The content from the external parser MUST be returned in utf-8!
1153 *
1154 * @param string File extension, eg. "pdf", "doc" etc.
1155 * @param string Absolute filename of file (must exist and be validated OK before calling function)
 * @param string Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
1157 * @return array Standard content array (title, description, keywords, body keys)
1158 */
function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
	// Delegates to the external parser registered for the extension; returns
	// NULL when no parser object is registered for it.
	$parser = $this->external_parsers[$fileExtension];
	if (!is_object($parser)) {
		return NULL;
	}

	return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1169
1170 /**
1171 * Creates an array with pointers to divisions of document.
1172 *
1173 * @param string File extension
1174 * @param string Absolute filename (must exist and be validated OK before calling function)
1175 * @return array Array of pointers to sections that the document should be divided into
1176 */
function fileContentParts($ext,$absFile) {
	// Asks the external parser registered for the extension how the document
	// is divided; without a parser object the single pointer 0 is returned.
	$parser = $this->external_parsers[$ext];
	if (!is_object($parser)) {
		return array(0);
	}

	return $parser->fileContentParts($ext,$absFile);
}
1187
1188 /**
1189 * Splits non-HTML content (from external files for instance)
1190 *
1191 * @param string Input content (non-HTML) to index.
1192 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
1193 * @see splitHTMLContent()
1194 */
function splitRegularContent($content) {
	// Start from the default content array and place the content in "body"
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;

	return $parts;
}
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215 /**********************************
1216 *
1217 * Analysing content, Extracting words
1218 *
1219 **********************************/
1220
1221 /**
1222 * Convert character set and HTML entities in the value of input content array keys
1223 *
1224 * @param array Standard content array
1225 * @param string Charset of the input content (converted to utf-8)
1226 * @return void
1227 */
function charsetEntity2utf8(&$contentArr, $charset) {
	// Works in place on $contentArr: every non-empty value is converted to
	// utf-8 (unless $charset already is "utf-8") and has its entities decoded.
	$needsCharsetConversion = ($charset !== 'utf-8');

	foreach (array_keys($contentArr) as $field) {
		if (!strlen($contentArr[$field])) {
			continue;
		}

		$text = $contentArr[$field];
		if ($needsCharsetConversion) {
			$text = $this->csObj->utf8_encode($text, $charset);
		}

		// decode all numeric / html-entities in the string to real characters:
		$contentArr[$field] = $this->csObj->entities_to_utf8($text, TRUE);
	}
}
1243
1244 /**
1245 * Processing words in the array from split*Content -functions
1246 *
1247 * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words (made unique for title, keywords and description)
1249 */
function processWordsInArrays($contentArr) {
	// Let the lexer split every part of the content into words
	foreach (array_keys($contentArr) as $part) {
		$contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
	}

	// Duplicates are not wanted for title, keywords and description
	foreach (array('title', 'keywords', 'description') as $headerPart) {
		$contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
	}

	return $contentArr;
}
1265
1266 /**
1267 * Extracts the sample description text from the content array.
1268 *
1269 * @param array Content array
1270 * @return string Description string
1271 */
function bodyDescription($contentArr) {
	// Builds the sample description from $contentArr['body']. The maximum
	// length is taken from $this->conf['index_descrLgd'], forced into the
	// range 0-255 with 200 as default. A length of 0 yields an empty string.

	// Initialised so that a string is returned in every case (the variable
	// was undefined when the length was 0).
	$bodyDescription = '';

	$maxL = t3lib_utility_Math::forceIntegerInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
		// Replace each of the listed characters with a single blank
		$bodyDescription = str_replace(array(' ',TAB,CR,LF),' ',$contentArr['body']);

		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
1287
1288 /**
1289 * Analyzes content to use for indexing,
1290 *
1291 * @param array Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
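 * @return array Index array keyed by word (cut to 60 bytes); each entry holds "hash", "metaphone" and "count", plus "cmp" for words found in title/keywords/description and "first" for words first found in the body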
1293 */
function indexAnalyze($content) {
	$indexArr = array();

	// Header parts and the bit position passed on for each of them
	$headerPartBits = array(
		'title' => 7,
		'keywords' => 6,
		'description' => 5
	);
	foreach ($headerPartBits as $part => $bit) {
		$this->analyzeHeaderinfo($indexArr, $content, $part, $bit);
	}

	// The body is analysed last
	$this->analyzeBody($indexArr, $content);

	return $indexArr;
}
1305
1306 /**
1307 * Calculates relevant information for headercontent
1308 *
1309 * @param array Index array, passed by reference
1310 * @param array Standard content array
1311 * @param string Key from standard content array
1312 * @param integer Bit-wise priority to type
1313 * @return void
1314 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
	// Registers every word of $content[$key] in $retArr: creates the entry
	// (hash + metaphone) on first sight, sets bit $offset in "cmp" and
	// increases "count". Also increases $this->wordcount per word.

	// Nothing to do if the requested part is missing or not a word list
	if (!isset($content[$key]) || !is_array($content[$key])) {
		return;
	}

	foreach ($content[$key] as $val) {
		// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);

		if (!isset($retArr[$val])) {
			// Word ID (wid)
			$retArr[$val]['hash'] = tx_indexedsearch_util::md5inthash($val);

			// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch
				? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60)
				: '';
			$retArr[$val]['metaphone'] = $metaphone;
		}

		// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}

		// Priority used for flagBitMask feature (see extension configuration).
		// Read with isset() so a not yet existing "cmp" does not raise a notice.
		$previousFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$retArr[$val]['cmp'] = $previousFlags | pow(2,$offset);

		// Increase number of occurrences (starting at 1 for a new word)
		$retArr[$val]['count'] = isset($retArr[$val]['count']) ? $retArr[$val]['count'] + 1 : 1;
		$this->wordcount++;
	}
}
1343
1344 /**
1345 * Calculates relevant information for bodycontent
1346 *
1347 * @param array Index array, passed by reference
1348 * @param array Standard content array
1349 * @return void
1350 */
function analyzeBody(&$retArr,$content) {
	// Registers every word of $content['body'] in $retArr: creates the entry
	// (first position, hash, metaphone) on first sight and increases "count".
	// Also increases $this->wordcount per word.

	// Nothing to do if there is no body word list
	if (!isset($content['body']) || !is_array($content['body'])) {
		return;
	}

	foreach ($content['body'] as $key => $val) {
		// Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
		$val = substr($val, 0, 60);

		if (!isset($retArr[$val])) {
			// First occurrence (used for ranking results)
			$retArr[$val]['first'] = $key;

			// Word ID (wid)
			$retArr[$val]['hash'] = tx_indexedsearch_util::md5inthash($val);

			// Metaphone value is also only 60 chars long
			$metaphone = $this->enableMetaphoneSearch
				? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60)
				: '';
			$retArr[$val]['metaphone'] = $metaphone;
		}

		// Build metaphone fulltext string (can be used for fulltext indexing)
		if ($this->storeMetaphoneInfoAsWords) {
			$this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
		}

		// Increase number of occurrences (starting at 1 for a new word);
		// isset() avoids reading a not yet existing "count".
		$retArr[$val]['count'] = isset($retArr[$val]['count']) ? $retArr[$val]['count'] + 1 : 1;
		$this->wordcount++;
	}
}
1379
1380 /**
1381 * Creating metaphone based hash from input word
1382 *
1383 * @param string Word to convert
1384 * @param boolean If set, returns the raw metaphone value (not hashed)
1385 * @return mixed Metaphone hash integer (or raw value, string)
1386 */
function metaphone($word, $returnRawMetaphoneValue=FALSE) {
	// Use the configured metaphone object if there is one, otherwise fall
	// back to PHP's native metaphone() function
	$rawValue = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

	if ($returnRawMetaphoneValue) {
		return $rawValue;
	}

	// An empty metaphone value is reported as 0, anything else as integer hash
	if (!strlen($rawValue)) {
		return 0;
	}

	return tx_indexedsearch_util::md5inthash($rawValue);
}
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423 /********************************
1424 *
1425 * SQL; TYPO3 Pages
1426 *
1427 *******************************/
1428
1429 /**
1430 * Updates db with information about the page (TYPO3 page, not external media)
1431 *
1432 * @return void
1433 */
function submitPage() {
	// Writes the records for the current TYPO3 page: index_phash,
	// index_section, index_grlist, index_fulltext and (in debug mode)
	// index_debug. Existing records for the phash are removed first.
	$phash = $this->hash['phash'];
	$execTime = $GLOBALS['EXEC_TIME'];

	// Remove any current data for this phash:
	$this->removeOldIndexedPages($phash);

	// index_phash
	$phashRow = array(
		'phash' => $phash,
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		// 0 = TYPO3 page
		'item_type' => 0,
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $execTime,
		'crdate' => $execTime,
		// Creation date of page
		'item_crdate' => $this->conf['crdate'],
		// Sys language uid of the page. Should reflect which language it DOES actually display!
		'sys_language_uid' => $this->conf['sys_language_uid'],
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	if (tx_indexedsearch_util::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
	}

	// index_section (both hashes are the page's own phash here)
	$this->submit_section($phash, $phash);

	// index_grlist
	$this->submit_grlist($phash, $phash);

	// index_fulltext
	$fulltextRow = array(
		'phash' => $phash,
		'fulltextdata' => implode(' ', $this->contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	$fulltextLimit = $this->indexerConfig['fullTextDataLength'];
	if ($fulltextLimit > 0) {
		$fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $fulltextLimit);
	}
	if (tx_indexedsearch_util::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
	}

	// index_debug is only written in debug mode
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

	// Content and body are cut to 1000 bytes for the debug record
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
		'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $phash,
		'debuginfo' => serialize($debugInfo)
	);
	if (tx_indexedsearch_util::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1506
1507 /**
1508 * Stores gr_list in the database.
1509 *
1510 * @param integer Search result record phash
1511 * @param integer Actual phash of current content
1512 * @return void
1513 * @see update_grlist()
1514 */
function submit_grlist($hash,$phash_x) {
	// Build the gr_list record from the configured group list
	$groupList = $this->conf['gr_list'];
	$record = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => tx_indexedsearch_util::md5inthash($groupList),
		'gr_list' => $groupList
	);

	// Store it only if the index_grlist table is in use
	if (tx_indexedsearch_util::isTableUsed('index_grlist')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $record);
	}
}
1528
1529 /**
1530 * Stores section
1531 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
1532 *
1533 * @param integer phash of TYPO3 parent search result record
1534 * @param integer phash of the file indexation search record
1535 * @return void
1536 */
function submit_section($hash,$hash_t3) {
	// Base record; getRootLineFields() is handed the array to complete it
	$record = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);
	$this->getRootLineFields($record);

	// Store it only if the index_section table is in use
	if (tx_indexedsearch_util::isTableUsed('index_section')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $record);
	}
}
1550
1551 /**
1552 * Removes records for the indexed page, $phash
1553 *
1554 * @param integer phash value to flush
1555 * @return void
1556 */
function removeOldIndexedPages($phash) {
	$phashValue = intval($phash);

	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		if (tx_indexedsearch_util::isTableUsed($tableName)) {
			$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, 'phash=' . $phashValue);
		}
	}

	// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
	if (tx_indexedsearch_util::isTableUsed('index_section')) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . $phashValue);
	}
}
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583 /********************************
1584 *
1585 * SQL; External media
1586 *
1587 *******************************/
1588
1589
1590 /**
1591 * Updates db with information about the file
1592 *
1593 * @param array Array with phash and phash_grouping keys for file
1594 * @param string File name
1595 * @param array Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
1596 * @param string File extension determining the type of media.
1597 * @param integer Modification time of file.
1598 * @param integer Creation time of file.
1599 * @param integer Size of file in bytes
1600 * @param integer Content HASH value.
1601 * @param array Standard content array (using only title and body for a file)
1602 * @return void
1603 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {
	// Writes the records for an external file: index_phash, index_fulltext
	// and (in debug mode) index_debug. Existing records for the phash are
	// removed first.
	$phash = $hash['phash'];
	$execTime = $GLOBALS['EXEC_TIME'];

	// Find item type: the parser's mapping for the extension, else the extension itself
	$mappedItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	$storeItemType = $mappedItemType ? $mappedItemType : $ext;

	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($phash);

	// Split filename; a scheme marks the file as external URL below
	$fileParts = parse_url($file);

	// The file name serves as title when the content has none
	$itemTitle = trim($contentParts['title']) ? $contentParts['title'] : basename($file);

	// index_phash
	$phashRow = array(
		'phash' => $phash,
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => $itemTitle,
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $execTime,
		'crdate' => $execTime,
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $fileParts['scheme'] ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	if (tx_indexedsearch_util::isTableUsed('index_phash')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
	}

	// index_fulltext
	$fulltextRow = array(
		'phash' => $phash,
		'fulltextdata' => implode(' ', $contentParts),
		'metaphonedata' => $this->metaphoneContent
	);
	$fulltextLimit = $this->indexerConfig['fullTextDataLength'];
	if ($fulltextLimit > 0) {
		$fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $fulltextLimit);
	}
	if (tx_indexedsearch_util::isTableUsed('index_fulltext')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
	}

	// index_debug is only written in debug mode
	if (!$this->indexerConfig['debugMode']) {
		return;
	}

	// The body is cut to 1000 bytes for the debug record
	$debugInfo = array(
		'cHashParams' => $subinfo,
		'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $phash,
		'debuginfo' => serialize($debugInfo)
	);
	if (tx_indexedsearch_util::isTableUsed('index_debug')) {
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
	}
}
1670
1671 /**
1672 * Stores file gr_list for a file IF it does not exist already
1673 *
1674 * @param integer phash value of file
1675 * @return void
1676 */
function submitFile_grlist($hash) {
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

	// Testing if there is already a gr_list record for this phash, either
	// for the default group list or for the currently configured one. If
	// so, there is no need to place another one.
	$defaultListHash = tx_indexedsearch_util::md5inthash($this->defaultGrList);
	$currentListHash = tx_indexedsearch_util::md5inthash($this->conf['gr_list']);
	$where = 'phash=' . intval($hash)
		. ' AND (hash_gr_list=' . $defaultListHash
		. ' OR hash_gr_list=' . $currentListHash . ')';

	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existing == 0) {
		$this->submit_grlist($hash, $hash);
	}
}
1686
1687 /**
1688 * Stores file section for a file IF it does not exist
1689 *
1690 * @param integer phash value of file
1691 * @return void
1692 */
function submitFile_section($hash) {
	if (!tx_indexedsearch_util::isTableUsed('index_section')) {
		return;
	}

	// Testing if there is already a section for this file on the current page
	$where = 'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id']);
	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);

	if ($existing == 0) {
		// Link the file's phash to the phash held in $this->hash
		$this->submit_section($hash, $this->hash['phash']);
	}
}
1702
1703 /**
 * Removes records for the indexed file, $phash
1705 *
1706 * @param integer phash value to flush
1707 * @return void
1708 */
function removeOldIndexedFiles($phash) {
	// Removing old registrations for tables (index_section is not part of this list).
	$where = 'phash=' . intval($phash);
	$tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $tableName) {
		if (tx_indexedsearch_util::isTableUsed($tableName)) {
			$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
		}
	}
}
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732 /********************************
1733 *
1734 * SQL Helper functions
1735 *
1736 *******************************/
1737
/**
 * Compares the mtime / tstamp of an already indexed page or file (looked up
 * by phash) against the configured age limits and the given mtime, and
 * decides whether indexing is needed.
 *
 * Result codes (see $this->reasons):
 *  -2  The minimum age was NOT exceeded, so indexing cannot occur.
 *  -1  mtime matched, no need to re-index. The tstamp of the index_phash row
 *      is refreshed, unless a maxAge is configured.
 *   1  The maximum age was exceeded, the page must be indexed again.
 *   2  mtime differs from the indexed mtime, the page must be indexed.
 *   3  No mtime was given, so the page is indexed.
 *   4  No indexed page was found (or index_phash is not in use), so the page is indexed.
 *
 * @param integer $mtime mtime value to test against the limits and the indexed page (usually the mtime of the cached document)
 * @param integer $phash "phash" used to select any already indexed page
 * @return integer <0 = no indexing, >0 = do indexing
 */
function checkMtimeTstamp($mtime,$phash) {
	// Without the index_phash table nothing can have been indexed yet.
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return 4;
	}

	$indexedRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . intval($phash));
	if (!$indexedRow) {
		// The page is not represented in index_phash, i.e. it has never been indexed.
		return 4;
	}

	$now = $GLOBALS['EXEC_TIME'];

	// The configured max-age was exceeded for the document, so it is indexed.
	if ($this->tstamp_maxAge && ($indexedRow['tstamp'] + $this->tstamp_maxAge) < $now) {
		return 1;
	}

	// A minimum age is configured and has not been exceeded yet.
	if ($this->tstamp_minAge && ($indexedRow['tstamp'] + $this->tstamp_minAge) >= $now) {
		return -2;
	}

	// From here on the minimum age is either unset or exceeded, so mtime decides.
	if (!$mtime) {
		// No mtime to compare with, so the page is indexed.
		return 3;
	}
	if ($indexedRow['item_mtime'] != $mtime) {
		// mtime differs from the one stored in index_phash, so it is time to re-index.
		return 2;
	}

	// mtime matched the document: no changes detected and no content is updated.
	if ($this->tstamp_maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $now) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
	}
	return -1;
}
1795
/**
 * Looks for an index_phash row with the current phash_grouping and the
 * current content hash.
 *
 * @return mixed TRUE if the page needs to be indexed (no such row exists or index_phash is not in use), otherwise the matching row (an array with the key "phash") to which the grlist record should be related.
 */
function checkContentHash() {
	if (tx_indexedsearch_util::isTableUsed('index_phash')) {
		// The page is only indexed if its content differs from the same "phash_grouping" page.
		$where = 'phash_grouping=' . intval($this->hash['phash_grouping'])
			. ' AND contentHash=' . intval($this->content_md5h);
		$matchingRow = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
		if ($matchingRow) {
			return $matchingRow;
		}
	}

	return TRUE;
}
1813
/**
 * Checks the content hash of an external document against index_phash.
 *
 * @param integer $hashGr phash value to check (phash_grouping)
 * @param integer $content_md5h Content hash to check
 * @return boolean TRUE if the document needs to be indexed, i.e. no row with this grouping and content hash was counted or index_phash is not in use
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return TRUE;
	}

	$where = 'phash_grouping=' . intval($hashGr) . ' AND contentHash=' . intval($content_md5h);
	$matchingRows = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);

	return ($matchingRows == 0);
}
1831
/**
 * Tells whether index_grlist contains at least one row whose phash_x equals
 * the given value (the "real" phash of the current content, not the linked-to
 * phash of the common search result page).
 *
 * @param integer $phash_x Phash integer to test
 * @return boolean TRUE if such a row exists, FALSE if not or if index_grlist is not in use
 */
function is_grlist_set($phash_x) {
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return FALSE;
	}

	$rows = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . intval($phash_x));

	return ($rows > 0);
}
1846
/**
 * Writes a grlist entry for the given phash and the current gr_list if
 * index_grlist does not contain one yet.
 *
 * @param integer $phash phash of the search result that should be found
 * @param integer $phash_x The real phash of the current content. The two values differ when a page with user login turns out to contain exactly the same content as another, already indexed version of the page.
 * @return void
 * @see submit_grlist()
 */
function update_grlist($phash, $phash_x) {
	if (!tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

	$grListHash = tx_indexedsearch_util::md5inthash($this->conf['gr_list']);
	$where = 'phash=' . intval($phash) . ' AND hash_gr_list=' . $grListHash;
	$existingRows = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);

	if ($existingRows == 0) {
		$this->submit_grlist($phash, $phash_x);
		$this->log_setTSlogMessage("Inserted gr_list '" . $this->conf['gr_list'] . "' for phash '" . $phash . "'", 1);
	}
}
1864
/**
 * Sets the tstamp of an index_phash row to the current execution time and
 * optionally stores a new item_mtime as well.
 *
 * @param integer $phash phash value of the row to update
 * @param integer $mtime If set (non-empty), item_mtime is updated to this value
 * @return void
 */
function updateTstamp($phash, $mtime = 0) {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	$fields = array('tstamp' => $GLOBALS['EXEC_TIME']);
	if ($mtime) {
		$fields['item_mtime'] = intval($mtime);
	}

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), $fields);
}
1883
/**
 * Writes the configured freeIndexSetId ($this->conf['freeIndexSetId']) into
 * the index_phash row of the given phash.
 *
 * @param integer $phash phash value of the row to update
 * @return void
 */
function updateSetId($phash) {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	$fields = array('freeIndexSetId' => intval($this->conf['freeIndexSetId']));
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), $fields);
}
1898
/**
 * Stores the parse time in the index_phash row of the given phash.
 *
 * @param integer $phash phash value of the row to update
 * @param integer $parsetime Parsetime value to set
 * @return void
 */
function updateParsetime($phash, $parsetime) {
	if (!tx_indexedsearch_util::isTableUsed('index_phash')) {
		return;
	}

	$fields = array('parsetime' => intval($parsetime));
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . intval($phash), $fields);
}
1914
/**
 * Rewrites the rootline fields of all index_section rows that belong to the
 * current page id ($this->conf['id']).
 *
 * @return void
 */
function updateRootline() {
	if (!tx_indexedsearch_util::isTableUsed('index_section')) {
		return;
	}

	// getRootLineFields() fills the array by reference.
	$rootLineFields = array();
	$this->getRootLineFields($rootLineFields);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . intval($this->conf['id']), $rootLineFields);
}
1927
/**
 * Adds the rootline field values to a field array.
 * rl0, rl1 and rl2 are always set from $this->conf['rootline_uids']. More
 * fields can be configured as fieldName => rootline level pairs in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 */
function getRootLineFields(array &$fieldArray) {
	$rootlineUids = $this->conf['rootline_uids'];

	// Standard fields: rlN takes the uid found at rootline level N.
	foreach (array('rl0' => 0, 'rl1' => 1, 'rl2' => 2) as $standardField => $level) {
		$fieldArray[$standardField] = intval($rootlineUids[$level]);
	}

	// Additionally configured fields, if any.
	$additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (is_array($additionalFields)) {
		foreach ($additionalFields as $fieldName => $rootLineLevel) {
			$fieldArray[$fieldName] = intval($rootlineUids[$rootLineLevel]);
		}
	}
}
1946
/**
 * Removes indexed pages that share the current phash_grouping and content
 * hash but are registered under a gr_list other than the default one
 * ($this->defaultGrList), i.e. versions indexed under login conditions.
 * NOT USED anywhere inside this class!
 *
 * Does nothing if index_phash or index_grlist is not in use, or if the
 * lookup query fails.
 *
 * @return void
 */
function removeLoginpagesWithContentHash() {
	if (!tx_indexedsearch_util::isTableUsed('index_phash') || !tx_indexedsearch_util::isTableUsed('index_grlist')) {
		return;
	}

	$where = 'A.phash=B.phash'
		. ' AND A.phash_grouping=' . intval($this->hash['phash_grouping'])
		. ' AND B.hash_gr_list!=' . tx_indexedsearch_util::md5inthash($this->defaultGrList)
		. ' AND A.contentHash=' . intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', $where);

	// A failed query yields no result resource: there is nothing to iterate over
	// and nothing that may be handed to sql_free_result().
	if (!$res) {
		return;
	}

	// A truthiness check (as in checkWordList()) ends the loop on any empty fetch result, not only on FALSE.
	while (($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		$this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
		$this->removeOldIndexedPages($row['phash']);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);
}
1967
/**
 * Loads class.tx_crawler_lib.php from the "crawler" extension.
 *
 * @return void
 */
function includeCrawlerClass() {
	$crawlerLibFile = t3lib_extMgm::extPath('crawler') . 'class.tx_crawler_lib.php';
	t3lib_div::requireOnce($crawlerLibFile);
}
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986 /********************************
1987 *
1988 * SQL; Submitting words
1989 *
1990 *******************************/
1991
/**
 * Makes sure every word of the word list exists in index_words and inserts
 * the ones that are missing.
 *
 * @param array $wordListArray Word list array, keyed by the word; each entry holds (at least) "hash" and "metaphone"
 * @return void
 */
function checkWordList($wordListArray) {
	if (!tx_indexedsearch_util::isTableUsed('index_words') || !count($wordListArray)) {
		return;
	}

	// Build the comma list of word ids used for the lookups.
	$wordIds = array();
	foreach ($wordListArray as $wordInfo) {
		$wordIds[] = intval($wordInfo['hash']);
	}
	$where = 'wid IN (' . implode(',', $wordIds) . ')';

	// Cheap check first: if all words are registered already there is nothing to do.
	$totalWords = count($wordListArray);
	$knownWords = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
	if ($knownWords == $totalWords) {
		return;
	}

	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
	$this->log_setTSlogMessage('Inserting words: ' . ($totalWords - $knownWords), 1);

	// Drop every word that is stored already, so only new words remain in the list.
	while (($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
		unset($wordListArray[$row['baseword']]);
	}
	$GLOBALS['TYPO3_DB']->sql_free_result($res);

	foreach ($wordListArray as $word => $wordInfo) {
		// A duplicate-key error occurs here if an existing word was NOT removed above.
		// As long as the words are not longer than 60 characters (the size of the
		// baseword varchar) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array(
			'wid' => $wordInfo['hash'],
			'baseword' => $word,
			'metaphone' => $wordInfo['metaphone']
		));
	}
}
2029
/**
 * Stores the RELATIONS between words and a phash in index_rel.
 * Existing relations of the phash are deleted first.
 *
 * @param array $wordList Word list array; each entry provides "hash", "count", "first" and "cmp"
 * @param integer $phash phash value
 * @return void
 */
function submitWords($wordList, $phash) {
	if (!tx_indexedsearch_util::isTableUsed('index_rel')) {
		return;
	}

	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . intval($phash));

	foreach ($wordList as $wordInfo) {
		// Share of this word among all words, based on $this->wordcount.
		// NOTE(review): assumes $this->wordcount is non-zero whenever the word list is not empty - confirm.
		$relativeFrequency = $wordInfo['count'] / $this->wordcount;

		$relation = array(
			'phash' => $phash,
			'wid' => $wordInfo['hash'],
			'count' => $wordInfo['count'],
			'first' => $wordInfo['first'],
			'freq' => $this->freqMap($relativeFrequency),
			'flags' => ($wordInfo['cmp'] & $this->flagBitMask)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
2055
/**
 * Maps a frequency given as a real number in [0;1] to a number in
 * [0;$this->freqRange], and a value above 1 back again.
 *
 * Forward mapping multiplies by $this->freqMax * 100 * $this->freqRange and
 * caps the result at $this->freqRange. Values above 1 are divided by the same
 * factor.
 *
 * A frequency of exactly 1 (a document in which one word makes up all words)
 * belongs to the [0;1] input range and is therefore mapped forward. It used
 * to be treated as an already mapped value, which produced a result close to
 * zero instead of the maximum.
 *
 * @param double $freq Frequency
 * @return mixed Mapped frequency; the value is not rounded or cast by this method
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax * 100 * $this->freqRange;

	if ($freq <= 1) {
		$mapped = $freq * $mapFactor;
		// Anything above the range is cut off at the range limit.
		return $mapped > $this->freqRange ? $this->freqRange : $mapped;
	}

	return $freq / $mapFactor;
}
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084 /********************************
2085 *
2086 * Hashing
2087 *
2088 *******************************/
2089
/**
 * Calculates the search hashes for a TYPO3 page and stores them in
 * $this->hash as "phash_grouping" and "phash".
 *
 * @return void
 */
function setT3Hashes() {
	// Key order and value types matter here, because the array is serialized before hashing.
	$pageIdentity = array(
		'id' => intval($this->conf['id']),
		'type' => intval($this->conf['type']),
		'sys_lang' => intval($this->conf['sys_language_uid']),
		'MP' => strval($this->conf['MP']),
		'cHash' => $this->cHashParams
	);

	// Grouping hash: identifies a "page" as the combination of id, type, language, mountpoint and cHash parameters.
	$groupingHash = tx_indexedsearch_util::md5inthash(serialize($pageIdentity));
	$this->hash['phash_grouping'] = $groupingHash;

	// Plain phash: additionally takes gr_list into account, i.e. a subdivision for
	// page compositions that depend on the login. Such pages are expected to be
	// similar regardless of the login in most cases.
	$pageIdentity['gr_list'] = strval($this->conf['gr_list']);
	$pageHash = tx_indexedsearch_util::md5inthash(serialize($pageIdentity));
	$this->hash['phash'] = $pageHash;
}
2113
/**
 * Calculates the search hashes for an external file.
 *
 * @param string $file File name / path which identifies the file on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance, PDF files are divided into groups of pages for indexing.
 * @return array Array with the keys "phash_grouping" (hash of the file alone) and "phash" (hash of the file plus subinfo)
 */
function setExtHashes($file,$subinfo=array()) {
	// The array is serialized before hashing, so "file" has to stay the first key.
	$fileIdentity = array('file' => $file);
	$groupingHash = tx_indexedsearch_util::md5inthash(serialize($fileIdentity));

	// The plain phash additionally covers the subinfo.
	$fileIdentity['subinfo'] = $subinfo;
	$fileHash = tx_indexedsearch_util::md5inthash(serialize($fileIdentity));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $fileHash
	);
}
2137
/**
 * Calculates the md5 integer hash of a string. Only kept for compatibility
 * with previous versions: the call is logged as deprecated and then
 * delegated to tx_indexedsearch_util::md5inthash().
 *
 * @param string $stringToHash String to hash
 * @return int Integer interpretation of the md5 hash of the input string
 * @deprecated will be removed in 4.8.
 */
function md5inthash($stringToHash) {
	t3lib_div::logDeprecatedFunction();

	$integerHash = tx_indexedsearch_util::md5inthash($stringToHash);
	return $integerHash;
}
2150
2151 /*********************************
2152 *
2153 * Internal logging functions
2154 *
2155 *********************************/
2156
/**
 * Wrapper for push() of the object in $GLOBALS['TT'].
 * Does nothing if $GLOBALS['TT'] is not an object.
 *
 * @param string $msg Title to set
 * @param string $key Key, handed over to push() unchanged
 * @return void
 */
function log_push($msg,$key) {
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->push($msg, $key);
	}
}
2167
/**
 * Wrapper for pull() of the object in $GLOBALS['TT'].
 * Does nothing if $GLOBALS['TT'] is not an object.
 *
 * @return void
 */
function log_pull() {
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->pull();
	}
}
2176
/**
 * Wrapper for setTSlogMessage() of the object in $GLOBALS['TT'].
 * The message is appended to $this->internal_log in any case, also when
 * $GLOBALS['TT'] is not an object.
 *
 * @param string $msg Message to set
 * @param integer $errorNum Error number
 * @return void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->setTSlogMessage($msg, $errorNum);
	}

	$this->internal_log[] = $msg;
}
2188
2189
2190
2191
2192
2193
2194
2195
2196 /**************************
2197 *
2198 * tslib_fe hooks:
2199 *
2200 **************************/
2201
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as part of the fulltext index.
 *
 * The comma-separated list is split and trimmed with
 * t3lib_div::trimExplode(), joined again with ", " and wrapped in a single
 * leading and trailing space.
 *
 * @param string $keywordList Comma-separated list of keywords
 * @return string The re-joined keyword list
 * @see http://bugs.typo3.org/view.php?id=1436
 */
protected function addSpacesToKeywordList($keywordList) {
	$spacedList = implode(', ', t3lib_div::trimExplode(',', $keywordList));

	return ' ' . $spacedList . ' ';
}
2214 }
2215
2216
// Include the XCLASS file registered for this script, if there is one for the current TYPO3_MODE.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
	}
}
2220 ?>