[BUGFIX] XLIFF: Approved attribute is on wrong element
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.indexer.php
index f175ee3..95e950a 100755 (executable)
@@ -2,7 +2,7 @@
 /***************************************************************
 *  Copyright notice
 *
-*  (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
+*  (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
 *  All rights reserved
 *
 *  This script is part of the TYPO3 project. The TYPO3 project is
 /**
  * This class is a search indexer for TYPO3
  *
- * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
+ * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
  * Originally Christian Jul Jensen <christian@jul.net> helped as well.
  */
 /**
- * [CLASS/FUNCTION INDEX of SCRIPT]
- *
- *
- *
- *  135: class tx_indexedsearch_indexer
- *  198:     function hook_indexContent(&$pObj)
- *
- *              SECTION: Backend API
- *  283:     function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
- *  320:     function backend_setFreeIndexUid($freeIndexUid)
- *  337:     function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
- *
- *              SECTION: Initialization
- *  388:     function init()
- *  439:     function initializeExternalParsers()
- *
- *              SECTION: Indexing; TYPO3 pages (HTML content)
- *  480:     function indexTypo3PageContent()
- *  564:     function splitHTMLContent($content)
- *  610:     function getHTMLcharset($content)
- *  625:     function convertHTMLToUtf8($content,$charset='')
- *  653:     function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
- *  680:     function typoSearchTags(&$body)
- *  709:     function extractLinks($content)
- *  752:     function extractHyperLinks($string)
- *
- *              SECTION: Indexing; external URL
- *  804:     function indexExternalUrl($externalUrl)
- *  835:     function getUrlHeaders($url, $timeout = 2)
- *
- *              SECTION: Indexing; external files (PDF, DOC, etc)
- *  895:     function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
- * 1001:     function readFileContent($ext,$absFile,$cPKey)
- * 1018:     function fileContentParts($ext,$absFile)
- * 1036:     function splitRegularContent($content)
- *
- *              SECTION: Analysing content, Extracting words
- * 1069:     function charsetEntity2utf8(&$contentArr, $charset)
- * 1091:     function procesWordsInArrays($contentArr)
- * 1114:     function bodyDescription($contentArr)
- * 1135:     function indexAnalyze($content)
- * 1156:     function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
- * 1175:     function analyzeBody(&$retArr,$content)
- * 1195:     function metaphone($word,$retRaw=FALSE)
- *
- *              SECTION: SQL; TYPO3 Pages
- * 1237:     function submitPage()
- * 1306:     function submit_grlist($hash,$phash_x)
- * 1326:     function submit_section($hash,$hash_t3)
- * 1344:     function removeOldIndexedPages($phash)
- *
- *              SECTION: SQL; External media
- * 1387:     function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
- * 1445:     function submitFile_grlist($hash)
- * 1459:     function submitFile_section($hash)
- * 1473:     function removeOldIndexedFiles($phash)
- *
- *              SECTION: SQL Helper functions
- * 1509:     function checkMtimeTstamp($mtime,$phash)
- * 1545:     function checkContentHash()
- * 1562:     function checkExternalDocContentHash($hashGr,$content_md5h)
- * 1576:     function is_grlist_set($phash_x)
- * 1589:     function update_grlist($phash,$phash_x)
- * 1604:     function updateTstamp($phash,$mtime=0)
- * 1620:     function updateParsetime($phash,$parsetime)
- * 1633:     function updateRootline()
- * 1648:     function getRootLineFields(&$fieldArr)
- * 1667:     function removeLoginpagesWithContentHash()
- *
- *              SECTION: SQL; Submitting words
- * 1702:     function checkWordList($wl)
- * 1739:     function submitWords($wl,$phash)
- * 1763:     function freqMap($freq)
- *
- *              SECTION: Hashing
- * 1796:     function setT3Hashes()
- * 1822:     function setExtHashes($file,$subinfo=array())
- * 1846:     function md5inthash($str)
- * 1856:     function makeCHash($paramArray)
- *
- *              SECTION: Internal logging functions
- * 1898:     function log_push($msg,$key)
- * 1907:     function log_pull()
- * 1918:     function log_setTSlogMessage($msg, $errorNum=0)
- *
- * TOTAL FUNCTIONS: 55
- * (This index is automatically created/updated by the extension "extdeveval")
- *
- */
-
-
-require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
-
-
-/**
  * Indexing class for TYPO3 frontend
  *
- * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
+ * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
  * @package TYPO3
  * @subpackage tx_indexedsearch
  */
@@ -158,6 +63,9 @@ class tx_indexedsearch_indexer {
        var $tstamp_minAge = 0;         // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
        var $maxExternalFiles = 0;      // Max number of external files to index.
 
+       var $forceIndexing = FALSE;             // If TRUE, indexing is forced regardless of hashes etc.
+       var $crawlerActive = FALSE;             // Set when crawler is detected (internal)
+
                // INTERNALS:
        var $defaultContentArray=array(
                'title' => '',
@@ -169,7 +77,7 @@ class tx_indexedsearch_indexer {
        var $externalFileCounter = 0;
 
        var $conf = array();            // Configuration set internally (see init functions for required keys and their meaning)
-       var $indexerConfig = array();   // Indexer configuration
+       var $indexerConfig = array();   // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
        var $hash = array();            // Hash array, contains phash and phash_grouping
        var $file_phash_arr = array();  // Hash array for files
        var $contentParts = array();    // Content of TYPO3 page
@@ -179,13 +87,30 @@ class tx_indexedsearch_indexer {
 
        var $cHashParams = array();     // cHashparams array
 
-       var $freqRange = 65000;
+       var $freqRange = 32000;
        var $freqMax = 0.1;
 
                // Objects:
-       var $csObj;                             // Charset class object , t3lib_cs
-       var $metaphoneObj;              // Metaphone object, if any
-       var $lexerObj;                  // Lexer object for word splitting
+       /**
+        * Charset class object
+        *
+        * @var t3lib_cs
+        */
+       var $csObj;
+
+       /**
+        * Metaphone object, if any
+        *
+        * @var user_DoubleMetaPhone
+        */
+       var $metaphoneObj;
+
+       /**
+        * Lexer object for word splitting
+        *
+        * @var tx_indexedsearch_lexer
+        */
+       var $lexerObj;
 
 
 
@@ -200,59 +125,76 @@ class tx_indexedsearch_indexer {
                        // Indexer configuration from Extension Manager interface:
                $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
 
+                       // Crawler activation:
+                       // Requirements are that the crawler is loaded, a crawler session is running and re-indexing is requested as a processing instruction:
+               if (t3lib_extMgm::isLoaded('crawler')
+                               && $pObj->applicationData['tx_crawler']['running']
+                               && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']))        {
+
+                               // Setting simple log message:
+                       $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
+
+                               // Setting variables:
+                       $this->crawlerActive = TRUE;    // Crawler active flag
+                       $this->forceIndexing = TRUE;    // Force indexing despite timestamps etc.
+               }
+
                        // Determine if page should be indexed, and if so, configure and initialize indexer
                if ($pObj->config['config']['index_enable'])    {
                        $this->log_push('Index page','');
 
-                       if (!$indexerConfig['disableFrontendIndexing']) {
+                       if (!$indexerConfig['disableFrontendIndexing'] || $this->crawlerActive) {
                                if (!$pObj->page['no_search'])  {
                                        if (!$pObj->no_cache)   {
+                                               if (!strcmp($pObj->sys_language_uid,$pObj->sys_language_content))       {
 
-                                                       // Setting up internal configuration from config array:
-                                               $this->conf = array();
-
-                                                       // Information about page for which the indexing takes place
-                                               $this->conf['id'] = $pObj->id;                          // Page id
-                                               $this->conf['type'] = $pObj->type;                      // Page type
-                                               $this->conf['sys_language_uid'] = $pObj->sys_language_uid;      // sys_language UID of the language of the indexing.
-                                               $this->conf['MP'] = $pObj->MP;                          // MP variable, if any (Mount Points)
-                                               $this->conf['gr_list'] = $pObj->gr_list;        // Group list
-
-                                               $this->conf['cHash'] = $pObj->cHash;                                    // cHash string for additional parameters
-                                               $this->conf['cHash_array'] = $pObj->cHash_array;                // Array of the additional parameters
-
-                                               $this->conf['crdate'] = $pObj->page['crdate'];                  // The creation date of the TYPO3 page
-                                               $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;        // reg1 of the caching table. Not known what practical use this has.
-
-                                                       // Root line uids
-                                               $this->conf['rootline_uids'] = array();
-                                               foreach($pObj->config['rootLine'] as $rlkey => $rldat)  {
-                                                       $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
-                                               }
+                                                               // Setting up internal configuration from config array:
+                                                       $this->conf = array();
 
-                                                       // Content of page:
-                                               $this->conf['content'] = $pObj->content;                                        // Content string (HTML of TYPO3 page)
-                                               $this->conf['indexedDocTitle'] = $pObj->indexedDocTitle;        // Alternative title for indexing
-                                               $this->conf['metaCharset'] = $pObj->metaCharset;                        // Character set of content (will be converted to utf-8 during indexing)
-                                               $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];      // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
+                                                               // Information about page for which the indexing takes place
+                                                       $this->conf['id'] = $pObj->id;                          // Page id
+                                                       $this->conf['type'] = $pObj->type;                      // Page type
+                                                       $this->conf['sys_language_uid'] = $pObj->sys_language_uid;      // sys_language UID of the language of the indexing.
+                                                       $this->conf['MP'] = $pObj->MP;                          // MP variable, if any (Mount Points)
+                                                       $this->conf['gr_list'] = $pObj->gr_list;        // Group list
 
-                                                       // Configuration of behavior:
-                                               $this->conf['index_externals'] = $pObj->config['config']['index_externals'];    // Whether to index external documents like PDF, DOC etc. (if possible)
-                                               $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];              // Length of description text (max 250, default 200)
+                                                       $this->conf['cHash'] = $pObj->cHash;                                    // cHash string for additional parameters
+                                                       $this->conf['cHash_array'] = $pObj->cHash_array;                // Array of the additional parameters
 
-                                                       // Set to zero:
-                                               $this->conf['recordUid'] = 0;
-                                               $this->conf['freeIndexUid'] = 0;
+                                                       $this->conf['crdate'] = $pObj->page['crdate'];                  // The creation date of the TYPO3 page
+                                                       $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;        // reg1 of the caching table. Not known what practical use this has.
 
-                                                       // Init and start indexing:
-                                               $this->init();
-                                               $this->indexTypo3PageContent();
+                                                               // Root line uids
+                                                       $this->conf['rootline_uids'] = array();
+                                                       foreach($pObj->config['rootLine'] as $rlkey => $rldat)  {
+                                                               $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
+                                                       }
 
+                                                               // Content of page:
+                                                       $this->conf['content'] = $pObj->content;                                        // Content string (HTML of TYPO3 page)
+                                                       $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);      // Alternative title for indexing
+                                                       $this->conf['metaCharset'] = $pObj->metaCharset;                        // Character set of content (will be converted to utf-8 during indexing)
+                                                       $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];      // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
+
+                                                               // Configuration of behavior:
+                                                       $this->conf['index_externals'] = $pObj->config['config']['index_externals'];    // Whether to index external documents like PDF, DOC etc. (if possible)
+                                                       $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];              // Length of description text (max 250, default 200)
+                                                       $this->conf['index_metatags'] = isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE;
+
+                                                               // Set to zero:
+                                                       $this->conf['recordUid'] = 0;
+                                                       $this->conf['freeIndexUid'] = 0;
+                                                       $this->conf['freeIndexSetId'] = 0;
+
+                                                               // Init and start indexing:
+                                                       $this->init();
+                                                       $this->indexTypo3PageContent();
+                                               } else $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
                                        } else $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
-                               } else $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page header!');
+                               } else $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
                        } else $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
+                       $this->log_pull();
                }
-               $this->log_pull();
        }
 
 
@@ -293,11 +235,12 @@ class tx_indexedsearch_indexer {
                $this->conf['gr_list'] = '0,-1';        // Group list (hardcoded for now...)
 
                        // cHash values:
-               $this->conf['cHash'] = $createCHash ? $this->makeCHash($cHash_array) : '';      // cHash string for additional parameters
+               $this->conf['cHash'] = $createCHash ? t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array)) : '';   // cHash string for additional parameters
                $this->conf['cHash_array'] = $cHash_array;              // Array of the additional parameters
 
                        // Set to defaults
                $this->conf['freeIndexUid'] = 0;
+               $this->conf['freeIndexSetId'] = 0;
                $this->conf['page_cache_reg1'] = '';
 
                        // Root line uids
@@ -306,6 +249,7 @@ class tx_indexedsearch_indexer {
                        // Configuration of behavior:
                $this->conf['index_externals'] = 1;     // Whether to index external documents like PDF, DOC etc. (if possible)
                $this->conf['index_descrLgd'] = 200;            // Length of description text (max 250, default 200)
+               $this->conf['index_metatags'] = TRUE;   // Whether to index document keywords and description (if present)
 
                        // Init and start indexing:
                $this->init();
@@ -315,10 +259,12 @@ class tx_indexedsearch_indexer {
         * Sets the free-index uid. Can be called right after backend_initIndexer()
         *
         * @param       integer         Free index UID
+        * @param       integer         Set id - an integer identifying the "set" of indexing operations.
         * @return      void
         */
-       function backend_setFreeIndexUid($freeIndexUid) {
+       function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)      {
                $this->conf['freeIndexUid'] = $freeIndexUid;
+               $this->conf['freeIndexSetId'] = $freeIndexSetId;
        }
 
        /**
@@ -328,7 +274,7 @@ class tx_indexedsearch_indexer {
         * @param       string          Keywords equivalent
         * @param       string          Description equivalent
         * @param       string          The main content to index
-        * @param       string          The charset of the title, keyword, description and body-content
+        * @param       string          The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
         * @param       integer         Last modification time, in seconds
         * @param       integer         The creation date of the content, in seconds
         * @param       integer         The record UID that the content comes from (for registration with the indexed rows)
@@ -400,10 +346,10 @@ class tx_indexedsearch_indexer {
 
                        // Indexer configuration from Extension Manager interface:
                $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
-               $this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
-               $this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
-               $this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
-               $this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);
+               $this->tstamp_minAge = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['minAge']*3600,0);
+               $this->tstamp_maxAge = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['maxAge']*3600,0);
+               $this->maxExternalFiles = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
+               $this->flagBitMask = t3lib_utility_Math::forceIntegerInRange($this->indexerConfig['flagBitMask'],0,255);
 
                        // Initialize external document parsers:
                        // Example configuration, see ext_localconf.php of this file!
@@ -416,17 +362,18 @@ class tx_indexedsearch_indexer {
                $lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?
                                                $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] :
                                                'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
-               $this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
+               $this->lexerObj = t3lib_div::getUserObj($lexerObjRef);
                $this->lexerObj->debug = $this->indexerConfig['debugMode'];
 
                        // Initialize metaphone hook:
                        // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
                if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
-                       $this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
+                       $this->metaphoneObj = t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
+                       $this->metaphoneObj->pObj = $this;
                }
 
                        // Init charset class:
-               $this->csObj = &t3lib_div::makeInstance('t3lib_cs');
+               $this->csObj = t3lib_div::makeInstance('t3lib_cs');
        }
 
        /**
@@ -441,10 +388,10 @@ class tx_indexedsearch_indexer {
 
                if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']))        {
                        foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef)    {
-                               $this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);
-                               $this->external_parsers[$extension]->pObj = &$this;
+                               $this->external_parsers[$extension] = t3lib_div::getUserObj($_objRef);
+                               $this->external_parsers[$extension]->pObj = $this;
 
-                                       // Init parser and if it returns false, unset its entry again:
+                                       // Init parser and if it returns FALSE, unset its entry again:
                                if (!$this->external_parsers[$extension]->initParser($extension))       {
                                        unset($this->external_parsers[$extension]);
                                }
@@ -482,10 +429,12 @@ class tx_indexedsearch_indexer {
                $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
                $is_grlist = $this->is_grlist_set($this->hash['phash']);
 
-               if ($check > 0 || !$is_grlist)  {
+               if ($check > 0 || !$is_grlist || $this->forceIndexing)  {
 
                                // Setting message:
-                       if ($check > 0) {
+                       if ($this->forceIndexing)       {
+                               $this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
+                       } elseif ($check > 0)   {
                                $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
                        } else {
                                $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
@@ -515,7 +464,7 @@ class tx_indexedsearch_indexer {
 
                                                // Splitting words
                                $this->log_push('Extract words from content','');
-                                       $splitInWords = $this->procesWordsInArrays($this->contentParts);
+                                       $splitInWords = $this->processWordsInArrays($this->contentParts);
                                $this->log_pull();
 
                                                // Analyse the indexed words.
@@ -545,6 +494,7 @@ class tx_indexedsearch_indexer {
                                $this->log_pull();
                        } else {
                                $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestamp
+                               $this->updateSetId($this->hash['phash']);
                                $this->update_grlist($checkCHash['phash'],$this->hash['phash']);        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
                                $this->updateRootline();
                                $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
@@ -574,11 +524,17 @@ class tx_indexedsearch_indexer {
                $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
 
                        // get keywords and description metatags
-               for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }
-               for($i=0;isset($meta[$i]);$i++) {
-                       $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
-                       if(stristr($meta[$i]['name'],'keywords')) $contentArr['keywords'].=','.$meta[$i]['content'];
-                       if(stristr($meta[$i]['name'],'description')) $contentArr['description'].=','.$meta[$i]['content'];
+               if($this->conf['index_metatags']) {
+                       for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }
+                       for($i=0;isset($meta[$i]);$i++) {
+                               $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
+                               if (stristr($meta[$i]['name'], 'keywords')) {
+                                       $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($meta[$i]['content']);
+                               }
+                               if (stristr($meta[$i]['name'], 'description')) {
+                                       $contentArr['description'] .= ',' . $meta[$i]['content'];
+                               }
+                       }
                }
 
                        // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
@@ -608,8 +564,8 @@ class tx_indexedsearch_indexer {
         * @return      string          The charset value if found.
         */
        function getHTMLcharset($content)       {
-               if (eregi('<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>',$content,$reg))       {
-                       if (eregi('charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)',$reg[0],$reg2))     {
+               if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg))       {
+                       if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2))     {
                                return $reg2[1];
                        }
                }
@@ -640,7 +596,7 @@ class tx_indexedsearch_indexer {
 
        /**
         * Finds first occurrence of embracing tags and returns the embraced content and the original string with
-        * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
+        * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
         * <title> of document or removing <script>-sections
         *
         * @param       string          String to search in
@@ -648,14 +604,14 @@ class tx_indexedsearch_indexer {
         * @param       string          Passed by reference: Content inside found tag
         * @param       string          Passed by reference: Content after found tag
         * @param       string          Passed by reference: Attributes of the found tag.
-        * @return      boolean         Returns false if tag was not found, otherwise true.
+        * @return      boolean         Returns FALSE if tag was not found, otherwise TRUE.
         */
        function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
                $endTag = '</'.$tagName.'>';
                $startTag = '<'.$tagName;
 
                $isTagInText = stristr($string,$startTag);              // stristr used because we want a case-insensitive search for the tag.
-               if(!$isTagInText) return false; // if the tag was not found, return false
+               if(!$isTagInText) return FALSE; // if the tag was not found, return FALSE
 
                list($paramList,$isTagInText) = explode('>',substr($isTagInText,strlen($startTag)),2);
                $afterTagInText = stristr($isTagInText,$endTag);
@@ -668,17 +624,17 @@ class tx_indexedsearch_indexer {
                        $stringAfter = $isTagInText;
                }
 
-               return true;
+               return TRUE;
        }
 
        /**
         * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
         *
         * @param       string          HTML Content, passed by reference
-        * @return      boolean         Returns true if a TYPOSEARCH_ tag was found, otherwise false.
+        * @return      boolean         Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
         */
        function typoSearchTags(&$body) {
-               $expBody = explode('<!--TYPO3SEARCH_',$body);
+               $expBody = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);
 
                if(count($expBody)>1) {
                        $body = '';
@@ -694,9 +650,9 @@ class tx_indexedsearch_indexer {
                                        $prev = $val;
                                }
                        }
-                       return true;
+                       return TRUE;
                } else {
-                       return false;
+                       return FALSE;
                }
        }
 
@@ -711,11 +667,20 @@ class tx_indexedsearch_indexer {
                        // Get links:
                $list = $this->extractHyperLinks($content);
 
+               if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler'))    {
+                       $this->includeCrawlerClass();
+                       $crawler = t3lib_div::makeInstance('tx_crawler_lib');
+               }
+
                        // Traverse links:
                foreach($list as $linkInfo)     {
 
                                // Decode entities:
-                       $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
+                       if ($linkInfo['localPath'])     {       // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
+                               $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath']);
+                       } else {
+                               $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
+                       }
 
                                // Parse URL:
                        $qParts = parse_url($linkSource);
@@ -727,66 +692,112 @@ class tx_indexedsearch_indexer {
                                $qParts = parse_url($linkSource);       // parse again due to new linkSource!
                        }
 
-                       if ($qParts['scheme'])  {
+                       if (!$linkInfo['localPath'] && $qParts['scheme']) {
                                if ($this->indexerConfig['indexExternalURLs'])  {
                                                // Index external URL (http or otherwise)
                                        $this->indexExternalUrl($linkSource);
                                }
                        } elseif (!$qParts['query']) {
-                               $localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
+                               $linkSource = urldecode($linkSource);
+                               if (t3lib_div::isAllowedAbsPath($linkSource))   {
+                                       $localFile = $linkSource;
+                               } else {
+                                       $localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
+                               }
                                if ($localFile && @is_file($localFile)) {
+
                                                // Index local file:
-                                       $this->indexRegularDocument($linkSource);
+                                       if ($linkInfo['localPath'])     {
+
+                                               $fI = pathinfo($linkSource);
+                                               $ext = strtolower($fI['extension']);
+                                               if (is_object($crawler))        {
+                                                       $params = array(
+                                                               'document' => $linkSource,
+                                                               'alturl' => $linkInfo['href'],
+                                                               'conf' => $this->conf
+                                                       );
+                                                       unset($params['conf']['content']);
+
+                                                       $crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
+                                                       $this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
+                                               } else {
+                                                       $this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
+                                               }
+                                       } else {
+                                               if (is_object($crawler))        {
+                                                       $params = array(
+                                                               'document' => $linkSource,
+                                                               'conf' => $this->conf
+                                                       );
+                                                       unset($params['conf']['content']);
+                                                       $crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
+                                                       $this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
+                                               } else {
+                                                       $this->indexRegularDocument($linkSource);
+                                               }
+                                       }
                                }
                        }
                }
        }
 
        /**
-        * Extracts all links to external documents from content string.
+        * Extracts all links (<a> tags with a non-empty href that does not start with "#") from the HTML content string
         *
-        * @param       string          Content to analyse
-        * @return      array           Array of hyperlinks
+        * @param string $html HTML content to scan for <a> tags
+        * @return array Array of hyperlinks (keys: tag, href, localPath (result of createLocalPath() for the href; empty if not local))
         * @see extractLinks()
         */
-       function extractHyperLinks($string)     {
-               if (!is_object($this->htmlParser))      {
-                       $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
-               }
-
-               $parts = $this->htmlParser->splitTags('a',$string);
-               $list = array();
-               foreach($parts as $k => $v)     {
-                       if ($k%2)       {
-                               $params = $this->htmlParser->get_tag_attributes($v,1);
-                               $firstTagName = $this->htmlParser->getFirstTagName($v); // The 'name' of the first tag
-
-                               switch(strtolower($firstTagName))       {
-                                       case 'a':
-                                               $src = $params[0]['href'];
-                                               if ($src)       {
-                                                       $list[] = array(
-                                                               'tag' => $v,
-                                                               'href' => $params[0]['href']
+       function extractHyperLinks($html)       {
+               $htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
+               $htmlParts = $htmlParser->splitTags('a', $html);
+               $hyperLinksData = array();
+               foreach ($htmlParts as $index => $tagData) {
+                       if (($index % 2) !== 0) {
+                               $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
+                               $firstTagName = $htmlParser->getFirstTagName($tagData);
+
+                               if (strtolower($firstTagName) == 'a') {
+                                       if ($tagAttributes[0]['href'] && $tagAttributes[0]['href']{0} != '#') {
+                                               $hyperLinksData[] = array(
+                                                       'tag' => $tagData,
+                                                       'href' => $tagAttributes[0]['href'],
+                                                       'localPath' => $this->createLocalPath($tagAttributes[0]['href'])
                                                        );
                                                }
-                                       break;
                                }
                        }
                }
 
-               return $list;
+               return $hyperLinksData;
        }
 
+       /**
+        * Extracts the "base href" from content string: the href of the first <base> tag that has a non-empty href.
+        *
+        * @param       string          HTML content to analyze
+        * @return      string          The base href, or an empty value if no base tag with a href is found
+        */
+       public function extractBaseHref($html) {
+               $href = '';
+               $htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
+               $htmlParts = $htmlParser->splitTags('base', $html);
+               foreach ($htmlParts as $index => $tagData) {
+                       if (($index % 2) !== 0) {
+                               $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
+                               $firstTagName = $htmlParser->getFirstTagName($tagData);
+                               if (strtolower($firstTagName) == 'base') {
+                                       $href = $tagAttributes[0]['href'];
+                                               if ($href) {
+                                               break;
+                                               }
+                               }
+                       }
+               }
 
-
-
-
-
-
-
-
-
+               return $href;
+       }
 
        /******************************************
         *
@@ -815,12 +826,14 @@ class tx_indexedsearch_indexer {
                        if (strlen($content))   {
 
                                        // Create temporary file:
-                               $tmpFile = t3lib_div::tempnam('EXTERNAL_URL').'.html';
-                               t3lib_div::writeFile($tmpFile, $content);
+                               $tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
+                               if ($tmpFile) {
+                                       t3lib_div::writeFile($tmpFile, $content);
 
-                                       // Index that file:
-                               $this->indexRegularDocument($externalUrl, FALSE, $tmpFile, 'html');
-                               unlink($tmpFile);
+                                               // Index that file:
+                                       $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');      // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
+                                       unlink($tmpFile);
+                               }
                        }
                }
        }
@@ -830,34 +843,20 @@ class tx_indexedsearch_indexer {
         *
         * @param       string          The URL
         * @param       integer         Timeout (seconds?)
-        * @return      mixed           If no answer, returns false. Otherwise an array where HTTP headers are keys
+        * @return      mixed           If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
         */
-       function getUrlHeaders($url, $timeout = 2)      {
-               $url = parse_url($url);
-
-               if(!in_array($url['scheme'],array('','http')))  return FALSE;
-
-               $fp = fsockopen ($url['host'], ($url['port'] > 0 ? $url['port'] : 80), $errno, $errstr, $timeout);
-               if (!$fp)       {
-                       return FALSE;
-               } else {
-                       $msg = "GET ".$url['path'].($url['query'] ? '?'.$url['query'] : '')." HTTP/1.0\r\nHost: ".$url['host']."\r\n\r\n";
-                       fputs ($fp, $msg);
-                       $d = '';
-                       while (!feof($fp)) {
-                               $line = fgets ($fp,2048);
-
-                               $d.=$line;
-                               if (!strlen(trim($line)))       {
-                                       break;
-                               }
-                       }
-                       fclose ($fp);
+       function getUrlHeaders($url)    {
+               $content = t3lib_div::getUrl($url,2);   // Try to get the headers only
 
+               if (strlen($content))   {
                                // Compile headers:
-                       $headers = t3lib_div::trimExplode(chr(10),$d,1);
+                       $headers = t3lib_div::trimExplode(LF,$content,1);
                        $retVal = array();
                        foreach($headers as $line)      {
+                               if (!strlen(trim($line)))       {
+                                       break;  // Stop at the first empty line (= end of header)
+                               }
+
                                list($headKey, $headValue) = explode(':', $line, 2);
                                $retVal[$headKey] = $headValue;
                        }
@@ -867,15 +866,155 @@ class tx_indexedsearch_indexer {
 
 
 
+       /**
+        * Resolves a link source to a local file path by calling each createLocalPath* resolver in turn
+        *
+        * @param string $sourcePath Link source to resolve (extractHyperLinks() passes the href of an <a> tag)
+        * @return string Path returned by the first resolver that yields a non-empty result, else empty string
+        */
+       protected function createLocalPath($sourcePath) {
+               $localPath = '';
+               static $pathFunctions = array( // Resolver method names, tried in exactly this order
+                       'createLocalPathFromT3vars',
+                       'createLocalPathUsingAbsRefPrefix',
+                       'createLocalPathUsingDomainURL',
+                       'createLocalPathFromAbsoluteURL',
+                       'createLocalPathFromRelativeURL'
+                       );
+               foreach ($pathFunctions as $functionName) {
+                       $localPath = $this->$functionName($sourcePath);
+                       if ($localPath != '') { // The first resolver with a non-empty result wins
+                               break;
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Looks up a local file path in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'],
+        * keyed by t3lib_div::shortMD5() of the source path. This is useful for download extensions
+        * that hide the actual file name but still want the file to be indexed.
+        *
+        * @param string $sourcePath Link source whose short MD5 hash is used as the lookup key
+        * @return string The registered path if the entry exists and is a file, else empty string
+        */
+       protected function createLocalPathFromT3vars($sourcePath) {
+               $localPath = '';
+               $indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
+               if (is_array($indexLocalFiles)) {
+                       $md5 = t3lib_div::shortMD5($sourcePath);
+                       // Note: not using self::isAllowedLocalFile here because this method
+                       // is allowed to index files outside of the web site (for example,
+                       // protected downloads)
+                       if (isset($indexLocalFiles[$md5]) && is_file($indexLocalFiles[$md5])) {
+                               $localPath = $indexLocalFiles[$md5];
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Maps a URL that starts with TYPO3_SITE_URL (read via t3lib_div::getIndpEnv()) to PATH_site plus the remainder of the URL.
+        * NOTE(review): unlike createLocalPathUsingAbsRefPrefix() the prefix length is not checked for being > 0 - confirm an empty site URL cannot occur.
+        * @param string $sourcePath Link source to compare against the site URL prefix
+        * @return string The mapped path if isAllowedLocalFile() accepts it, else empty string
+        */
+       protected function createLocalPathUsingDomainURL($sourcePath) {
+               $localPath = '';
+               $baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
+               $baseURLLength = strlen($baseURL);
+               if (substr($sourcePath, 0, $baseURLLength) == $baseURL) {
+                       $sourcePath = substr($sourcePath, $baseURLLength);
+                       $localPath = PATH_site . $sourcePath;
+                       if (!self::isAllowedLocalFile($localPath)) {
+                               $localPath = '';
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Attempts to create a local file path by stripping a leading config.absRefPrefix and prepending PATH_site. This
+        * requires $GLOBALS['TSFE'] to be a tslib_fe instance; otherwise, or if the prefix is empty, an empty string is returned.
+        *
+        * @param string $sourcePath Link source to compare against the absRefPrefix
+        * @return string The mapped path if isAllowedLocalFile() accepts it, else empty string
+        */
+       protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
+               $localPath = '';
+               if ($GLOBALS['TSFE'] instanceof tslib_fe) {
+                       $absRefPrefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
+                       $absRefPrefixLength = strlen($absRefPrefix);
+                       if ($absRefPrefixLength > 0 && substr($sourcePath, 0, $absRefPrefixLength) == $absRefPrefix) {
+                               $sourcePath = substr($sourcePath, $absRefPrefixLength);
+                               $localPath = PATH_site . $sourcePath;
+                               if (!self::isAllowedLocalFile($localPath)) {
+                                       $localPath = '';
+                               }
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Attempts to create a local file path from an absolute URL without
+        * scheme, i.e. one whose first character is a slash: the leading slash is removed and PATH_site is prepended.
+        *
+        * @param string $sourcePath Link source; only handled if it starts with a slash
+        * @return string The mapped path if isAllowedLocalFile() accepts it, else empty string
+        */
+       protected function createLocalPathFromAbsoluteURL($sourcePath) {
+               $localPath = '';
+               if ($sourcePath{0} == '/') {
+                       $sourcePath = substr($sourcePath, 1);
+                       $localPath = PATH_site . $sourcePath;
+                       if (!self::isAllowedLocalFile($localPath)) {
+                               $localPath = '';
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Attempts to create a local file path from a relative URL (as decided by isRelativeURL()) by prepending PATH_site.
+        *
+        * @param string $sourcePath Link source; only handled if isRelativeURL() returns TRUE for it
+        * @return string The mapped path if isAllowedLocalFile() accepts it, else empty string
+        */
+       protected function createLocalPathFromRelativeURL($sourcePath) {
+               $localPath = '';
+               if (self::isRelativeURL($sourcePath)) {
+                       $localPath = PATH_site . $sourcePath;
+                       if (!self::isAllowedLocalFile($localPath)) {
+                               $localPath = '';
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Checks if URL is relative: parse_url() reports no scheme and the path does not start with a slash.
+        *
+        * @param string $url URL to inspect; parse_url() warnings are suppressed with @
+        * @return boolean TRUE if the scheme is empty and the first character of the path is not a slash
+        */
+       static protected function isRelativeURL($url) {
+               $urlParts = @parse_url($url);
+               return ($urlParts['scheme'] == '' && $urlParts['path']{0} != '/');
+       }
 
-
-
+       /**
+        * Checks if the path, after t3lib_div::resolveBackPath(), starts with PATH_site and points to an existing file
+        *
+        * @param string $filePath Path to check
+        * @return boolean TRUE only if the resolved path begins with PATH_site and is_file() succeeds for it
+        */
+       static protected function isAllowedLocalFile($filePath) {
+               $filePath = t3lib_div::resolveBackPath($filePath); // presumably collapses "../" segments before the prefix test - confirm against t3lib_div
+               $insideWebPath = (substr($filePath, 0, strlen(PATH_site)) == PATH_site);
+               $isFile = is_file($filePath);
+               return $insideWebPath && $isFile;
+       }
 
        /******************************************
         *
@@ -911,7 +1050,7 @@ class tx_indexedsearch_indexer {
                }
 
                        // Indexing the document:
-               if ($absFile &&  @is_file($absFile))    {
+               if ($absFile && @is_file($absFile))     {
                        if ($this->external_parsers[$ext])      {
                                $mtime = filemtime($absFile);
                                $cParts = $this->fileContentParts($ext,$absFile);
@@ -949,7 +1088,7 @@ class tx_indexedsearch_indexer {
 
                                                                                // Splitting words
                                                                        $this->log_push('Extract words from content','');
-                                                                               $splitInWords = $this->procesWordsInArrays($contentParts);
+                                                                               $splitInWords = $this->processWordsInArrays($contentParts);
                                                                        $this->log_pull();
 
                                                                                // Analyse the indexed words.
@@ -1069,14 +1208,14 @@ class tx_indexedsearch_indexer {
        function charsetEntity2utf8(&$contentArr, $charset)     {
 
                        // Convert charset if necessary
-               reset($contentArr);
-               while(list($key,)=each($contentArr)) {
+               foreach ($contentArr as $key => $value) {
                        if (strlen($contentArr[$key]))  {
+
                                if ($charset!=='utf-8') {
                                        $contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
                                }
 
-                                       // decode all numeric / html-entitiesin in the string to real characters:
+                                       // decode all numeric / html-entities in the string to real characters:
                                $contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
                        }
                }
@@ -1088,11 +1227,10 @@ class tx_indexedsearch_indexer {
         * @param       array           Array of content to index, see splitHTMLContent() and splitRegularContent()
         * @return      array           Content input array modified so each key is not a unique array of words
         */
-       function procesWordsInArrays($contentArr)       {
+       function processWordsInArrays($contentArr)      {
 
                        // split all parts to words
-               reset($contentArr);
-               while(list($key,)=each($contentArr)) {
+               foreach ($contentArr as $key => $value) {
                        $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
                }
 
@@ -1114,10 +1252,11 @@ class tx_indexedsearch_indexer {
        function bodyDescription($contentArr)   {
 
                        // Setting description
-               $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
+               $maxL = t3lib_utility_Math::forceIntegerInRange($this->conf['index_descrLgd'],0,255,200);
                if ($maxL)      {
                                // Takes the quadruple lenght first, because whitespace and entities may be removed and thus shorten the string more yet.
-                       $bodyDescription = implode(' ',split('[[:space:],]+',substr(trim($contentArr['body']),0,$maxL*4)));
+       #               $bodyDescription = implode(' ',split('[[:space:],]+',substr(trim($contentArr['body']),0,$maxL*4)));
+                       $bodyDescription = str_replace(array(' ',TAB,CR,LF),' ',$contentArr['body']);
 
                                // Shorten the string:
                        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
@@ -1154,8 +1293,7 @@ class tx_indexedsearch_indexer {
         * @return      void
         */
        function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
-               reset($content[$key]);
-               while(list(,$val)=each($content[$key]))  {
+               foreach ($content[$key] as $val) {
                        $val = substr($val,0,60);       // Max 60 - because the baseword varchar IS 60. This MUST be the same.
                        $retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset);
                        $retArr[$val]['count'] = $retArr[$val]['count']+1;
@@ -1195,7 +1333,7 @@ class tx_indexedsearch_indexer {
        function metaphone($word,$retRaw=FALSE) {
 
                if (is_object($this->metaphoneObj))     {
-                       $tmp = $this->metaphoneObj->metaphone($word);
+                       $tmp = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
                } else {
                        $tmp = metaphone($word);
                }
@@ -1204,7 +1342,11 @@ class tx_indexedsearch_indexer {
                if ($retRaw)    return $tmp;
 
                        // Otherwise create hash and return integer
-               if($tmp=='') $ret=0; else $ret=hexdec(substr(md5($tmp),0,7));
+               if ($tmp == '') {
+                       $ret = 0;
+               } else {
+                       $ret = hexdec(substr(md5($tmp), 0, 7));
+               }
                return $ret;
        }
 
@@ -1255,14 +1397,16 @@ class tx_indexedsearch_indexer {
                        'item_description' => $this->bodyDescription($this->contentParts),
                        'item_mtime' => $this->conf['mtime'],
                        'item_size' => strlen($this->conf['content']),
-                       'tstamp' => time(),
-                       'crdate' => time(),
+                       'tstamp' => $GLOBALS['EXEC_TIME'],
+                       'crdate' => $GLOBALS['EXEC_TIME'],
                        'item_crdate' => $this->conf['crdate'], // Creation date of page
                        'sys_language_uid' => $this->conf['sys_language_uid'],  // Sys language uid of the page. Should reflect which language it DOES actually display!
                        'externalUrl' => 0,
                        'recordUid' => intval($this->conf['recordUid']),
                        'freeIndexUid' => intval($this->conf['freeIndexUid']),
+                       'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
                );
+
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
 
                        // PROCESSING index_section
@@ -1276,6 +1420,9 @@ class tx_indexedsearch_indexer {
                        'phash' => $this->hash['phash'],
                        'fulltextdata' => implode(' ', $this->contentParts)
                );
+               if ($this->indexerConfig['fullTextDataLength']>0)       {
+                       $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
+               }
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
 
                        // PROCESSING index_debug
@@ -1396,7 +1543,7 @@ class tx_indexedsearch_indexer {
                        // Split filename:
                $fileParts = parse_url($file);
 
-                       // setting new
+                       // Setting new
                $fields = array(
                        'phash' => $hash['phash'],
                        'phash_grouping' => $hash['phash_grouping'],
@@ -1409,12 +1556,13 @@ class tx_indexedsearch_indexer {
                        'item_mtime' => $mtime,
                        'item_size' => $size,
                        'item_crdate' => $ctime,
-                       'tstamp' => time(),
-                       'crdate' => time(),
+                       'tstamp' => $GLOBALS['EXEC_TIME'],
+                       'crdate' => $GLOBALS['EXEC_TIME'],
                        'gr_list' => $this->conf['gr_list'],
                        'externalUrl' => $fileParts['scheme'] ? 1 : 0,
                        'recordUid' => intval($this->conf['recordUid']),
                        'freeIndexUid' => intval($this->conf['freeIndexUid']),
+                       'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
                );
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
 
@@ -1423,6 +1571,9 @@ class tx_indexedsearch_indexer {
                        'phash' => $hash['phash'],
                        'fulltextdata' => implode(' ', $contentParts)
                );
+               if ($this->indexerConfig['fullTextDataLength']>0)       {
+                       $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
+               }
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
 
                        // PROCESSING index_debug
@@ -1448,8 +1599,14 @@ class tx_indexedsearch_indexer {
         */
        function submitFile_grlist($hash)       {
                        // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')');
-               if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res))  {
+               $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
+                       'phash',
+                       'index_grlist',
+                       'phash=' . intval($hash) .
+                               ' AND (hash_gr_list=' . $this->md5inthash($this->defaultGrList) .
+                               ' OR hash_gr_list=' . $this->md5inthash($this->conf['gr_list']) . ')'
+               );
+               if (!$count) {
                        $this->submit_grlist($hash,$hash);
                }
        }
@@ -1504,11 +1661,11 @@ class tx_indexedsearch_indexer {
 
        /**
         * Check the mtime / tstamp of the currently indexed page/file (based on phash)
-        * Return positive integer if the page needs to being indexed!
+        * Return positive integer if the page needs to be indexed
         *
-        * @param       integer         mtime value to test against limits and indexed page.
+        * @param       integer         mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
         * @param       integer         "phash" used to select any already indexed page to see what its mtime is.
-        * @return      integer         Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceed and so indexing cannot occur.  -1) Mtimes matched so no need to reindex page. 0) N/A   1) Max age exceeded, page must be indexed again.   2) mtime of indexed page doesn't match mtime given for current content and we must index page.  3) No mtime was set, so we will index...  4) No indexed page found, so of course we will index.
+        * @return      integer         Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur.  -1) mtime matched so no need to reindex page. 0) N/A   1) Max age exceeded, page must be indexed again.   2) mtime of indexed page doesn't match mtime given for current content and we must index page.  3) No mtime was set, so we will index...  4) No indexed page found, so of course we will index.
         */
        function checkMtimeTstamp($mtime,$phash)        {
 
@@ -1518,20 +1675,20 @@ class tx_indexedsearch_indexer {
 
                        // If there was an indexing of the page...:
                if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
-                       if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time())     {               // If max age is exceeded, index the page
+                       if ($this->tstamp_maxAge && ($row['tstamp'] + $this->tstamp_maxAge) < $GLOBALS['EXEC_TIME']) {  // If max age is exceeded, index the page
                                $out = 1;               // The configured max-age was exceeded for the document and thus it's indexed.
                        } else {
-                               if (!$this->tstamp_minAge || ($row['tstamp']+$this->tstamp_minAge)<time())      {       // if minAge is not set or if minAge is exceeded, consider at mtime
+                               if (!$this->tstamp_minAge || ($row['tstamp'] + $this->tstamp_minAge) < $GLOBALS['EXEC_TIME']) { // if minAge is not set or if minAge is exceeded, consider at mtime
                                        if ($mtime)     {               // It mtime is set, then it's tested. If not, the page must clearly be indexed.
                                                if ($row['item_mtime'] != $mtime)       {       // And if mtime is different from the index_phash mtime, it's about time to re-index.
                                                        $out = 2;               // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.
                                                } else {
                                                        $out = -1;              // mtime matched the document, so no changes detected and no content updated
                                                        if ($this->tstamp_maxAge)       {
-                                                               $this->log_setTSlogMessage('Mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
+                                                               $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
                                                        } else {
                                                                $this->updateTstamp($phash);    // Update the timestatmp
-                                                               $this->log_setTSlogMessage('Mtime matched, timestamp updated.',1);
+                                                               $this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
                                                        }
                                                }
                                        } else {$out = 3;       }       // The minimum age was exceed, but mtime was not set, so the page was indexed.
@@ -1544,7 +1701,7 @@ class tx_indexedsearch_indexer {
        /**
         * Check content hash in phash table
         *
-        * @return      mixed           Returns true if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
+        * @return      mixed           Returns TRUE if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
         */
        function checkContentHash()     {
                        // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
@@ -1557,11 +1714,11 @@ class tx_indexedsearch_indexer {
 
        /**
         * Check content hash for external documents
-        * Returns true if the document needs to be indexed (that is, there was no result)
+        * Returns TRUE if the document needs to be indexed (that is, there was no result)
         *
         * @param       integer         phash value to check (phash_grouping)
         * @param       integer         Content hash to check
-        * @return      boolean         Returns true if the document needs to be indexed (that is, there was no result)
+        * @return      boolean         Returns TRUE if the document needs to be indexed (that is, there was no result)
         */
        function checkExternalDocContentHash($hashGr,$content_md5h)     {
                $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h));
@@ -1578,8 +1735,11 @@ class tx_indexedsearch_indexer {
         * @return      integer         Number of index_grlist rows found for the given phash_x value (0 if none is set)
         */
        function is_grlist_set($phash_x)        {
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', 'phash_x='.intval($phash_x));
-               return $GLOBALS['TYPO3_DB']->sql_num_rows($res);
+               return $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(      // Replaces the former SELECT + sql_num_rows(); presumably yields the number of matching rows - confirm against the DB API
+                       'phash_x',      // Field handed to the count query
+                       'index_grlist', // Table holding the phash_x entries that are looked up
+                       'phash_x=' . intval($phash_x)   // Input is forced to integer before it is concatenated into the WHERE clause
+               );
        }
 
        /**
@@ -1607,7 +1767,7 @@ class tx_indexedsearch_indexer {
         */
        function updateTstamp($phash,$mtime=0)  {
                $updateFields = array(
-                       'tstamp' => time()
+                       'tstamp' => $GLOBALS['EXEC_TIME']
                );
                if ($mtime)     { $updateFields['item_mtime'] = intval($mtime); }
 
@@ -1615,6 +1775,20 @@ class tx_indexedsearch_indexer {
        }
 
        /**
+        * Update SetID of the index_phash record.
+        *
+        * @param       integer         phash value
+        * @return      void
+        */
+       function updateSetId($phash)    {       // Writes the configured free-index set id into the index_phash row(s) matching $phash
+               $updateFields = array(
+                       'freeIndexSetId' => intval($this->conf['freeIndexSetId'])       // Value comes from $this->conf and is forced to integer
+               );
+
+               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields);  // $phash is cast with intval() before being concatenated into the WHERE clause
+       }
+
+       /**
         * Update parsetime for phash row.
         *
         * @param       integer         phash value.
@@ -1680,7 +1854,16 @@ class tx_indexedsearch_indexer {
                }
        }
 
+       /**
+        * Includes the crawler class
+        *
+        * @return      void
+        */
+       function includeCrawlerClass()  {       // Loads class.tx_crawler_lib.php from the 'crawler' extension (at most once per request)
+               global $TYPO3_CONF_VARS;        // NOTE(review): not used directly here; presumably exposed for the required file, which executes in this function's scope - confirm
 
+               require_once(t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php');      // require_once is fatal if the file cannot be loaded; extPath() presumably returns the extension's base path - confirm
+       }
 
 
 
@@ -1704,9 +1887,8 @@ class tx_indexedsearch_indexer {
         * @return      void
         */
        function checkWordList($wl) {
-               reset($wl);
                $phashArr = array();
-               while(list($key,) = each($wl)) {
+               foreach ($wl as $key => $value) {
                        $phashArr[] = $wl[$key]['hash'];
                }
                if (count($phashArr))   {
@@ -1719,8 +1901,7 @@ class tx_indexedsearch_indexer {
                                        unset($wl[$row['baseword']]);
                                }
 
-                               reset($wl);
-                               while(list($key,$val)=each($wl)) {
+                               foreach ($wl as $key => $val) {
                                        $insertFields = array(
                                                'wid' => $val['hash'],
                                                'baseword' => $key,
@@ -1851,31 +2032,6 @@ class tx_indexedsearch_indexer {
                return hexdec(substr(md5($str),0,7));
        }
 
-       /**
-        * Calculates the cHash value of input GET array (for constructing cHash values if needed)
-        *
-        * @param       array           Array of GET parameters to encode
-        * @return      void
-        */
-       function makeCHash($paramArray) {
-               $addQueryParams = t3lib_div::implodeArrayForUrl('', $paramArray);
-               $params = explode('&',substr($addQueryParams,1));       // Splitting parameters up
-
-                       // Make array:
-               $pA = array();
-               foreach($params as $theP)       {
-                       $pKV = explode('=', $theP);     // SPlitting single param by '=' sign
-                       if (!t3lib_div::inList('id,type,no_cache,cHash,MP,ftu',$pKV[0]))        {
-                               $pA[$pKV[0]] = (string)rawurldecode($pKV[1]);
-                       }
-               }
-               $pA['encryptionKey'] = $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'];
-               ksort($pA);
-
-               return t3lib_div::shortMD5(serialize($pA));
-       }
-
-
 
 
 
@@ -1923,10 +2079,36 @@ class tx_indexedsearch_indexer {
                if (is_object($GLOBALS['TT']))          $GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
                $this->internal_log[] = $msg;
        }
+
+
+
+
+
+
+
+
+       /**************************
+        *
+        * tslib_fe hooks:
+        *
+        **************************/
+
+       /**
+        * Makes sure that keywords are space-separated. This is important for their
+        * proper display as part of the fulltext index.
+        *
+        * @param string $keywordList
+        * @return string
+        * @see http://bugs.typo3.org/view.php?id=1436
+        */
+       protected function addSpacesToKeywordList($keywordList) {       // Normalizes a comma-separated keyword string to ", "-separated form
+               $keywords = t3lib_div::trimExplode(',', $keywordList);  // Split on commas; trimExplode presumably also strips whitespace around each item - confirm
+               return ' ' . implode(', ', $keywords) . ' ';    // Re-join with ", " and pad the result with one leading and one trailing space
+       }
 }
 
 
-if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])   {
-       include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
+if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])) {       // Only when an XCLASS entry is registered for this file. NOTE(review): isset() also passes for an empty-string entry, unlike the previous truthiness check
+       include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);        // Pull in the registered extending class file (once)
+}
-?>
+?>
\ No newline at end of file