Indexed Search modifications for support of cronjob based indexing. More to come...
authorKasper Skårhøj <kasper@typo3.org>
Wed, 16 Nov 2005 00:36:38 +0000 (00:36 +0000)
committerKasper Skårhøj <kasper@typo3.org>
Wed, 16 Nov 2005 00:36:38 +0000 (00:36 +0000)
git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@865 709f56b5-9817-0410-a4d7-c38de5d9e867

typo3/sysext/indexed_search/class.crawler.php [new file with mode: 0755]
typo3/sysext/indexed_search/class.indexer.php
typo3/sysext/indexed_search/doc/TODO.txt
typo3/sysext/indexed_search/ext_localconf.php
typo3/sysext/indexed_search/ext_tables.sql
typo3/sysext/indexed_search/modfunc1/class.tx_indexedsearch_modfunc1.php
typo3/sysext/indexed_search/tca.php

diff --git a/typo3/sysext/indexed_search/class.crawler.php b/typo3/sysext/indexed_search/class.crawler.php
new file mode 100755 (executable)
index 0000000..3d520e1
--- /dev/null
@@ -0,0 +1,533 @@
+<?php
+/***************************************************************
+*  Copyright notice
+*
+*  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
+*  All rights reserved
+*
+*  This script is part of the TYPO3 project. The TYPO3 project is
+*  free software; you can redistribute it and/or modify
+*  it under the terms of the GNU General Public License as published by
+*  the Free Software Foundation; either version 2 of the License, or
+*  (at your option) any later version.
+*
+*  The GNU General Public License can be found at
+*  http://www.gnu.org/copyleft/gpl.html.
+*  A copy is found in the textfile GPL.txt and important notices to the license
+*  from the author is found in LICENSE.txt distributed with these scripts.
+*
+*
+*  This script is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*  GNU General Public License for more details.
+*
+*  This copyright notice MUST APPEAR in all copies of the script!
+***************************************************************/
+/**
+ * Crawler hook for indexed search. Works with the "crawler" extension
+ *
+ * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
+ */
+/**
+ * [CLASS/FUNCTION INDEX of SCRIPT]
+ *
+ *
+ *
+ *   59: class tx_indexedsearch_crawler
+ *   70:     function crawler_init(&$pObj)
+ *  119:     function crawler_execute($params,&$pObj)
+ *  180:     function checkUrl($url,$urlLog,$baseUrl)
+ *  212:     function indexExtUrl($url, $pageId, $rl, $cfgUid)
+ *  251:     function loadIndexerClass()
+ *  263:     function getUidRootLineForClosestTemplate($id)
+ *
+ * TOTAL FUNCTIONS: 6
+ * (This index is automatically created/updated by the extension "extdeveval")
+ *
+ */
+
+
+
+/**
+ * Crawler hook for indexed search. Works with the "crawler" extension
+ *
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
+ * @package TYPO3
+ * @subpackage tx_indexedsearch
+ */
+class tx_indexedsearch_crawler {
+
+               // Static:
+       var $secondsPerExternalUrl = 3;         // Number of seconds to use as interval between queued indexing operations of URLs
+
+               // Internal, dynamic:
+       var $instanceCounter = 0;               // Counts up for each added URL
+
+               // Internal, static:
+       var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';               // The object reference to this class.
+
+       /**
+        * Initialization of crawler hook.
+        * Selects all indexing configurations which are visible, started and not currently running (set_id=0), marks each of them as running by assigning a new set-ID and adds the first entry for it to the crawler queue.
+        * Afterwards configurations whose processing has finished are cleaned up.
+        * NOTE: The fields first_run_time, frequency and last_run are selected but not evaluated in this function; every selected configuration is started.
+        *
+        * @param       object          Parent object (tx_crawler lib)
+        * @return      void
+        */
+       function crawler_init(&$pObj){
+
+                       // Where-clause matching the indexing configurations which are waiting to be activated:
+               $where = 'hidden=0
+                               AND (starttime=0 OR starttime<='.time().')
+                               AND set_id=0
+                               '.t3lib_BEfunc::deleteClause('index_config');
+
+               $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
+                       'uid,pid,first_run_time,frequency,last_run,type,externalUrl,filepath',
+                       'index_config',
+                       $where
+               );
+
+                       // Start each of the selected configurations:
+               foreach($indexingConfigurations as $cfgRec)     {
+
+                               // A unique set-ID identifies this run of the configuration:
+                       $setId = t3lib_div::md5int(microtime());
+
+                               // Flag the index-config record as running and clear old session data:
+                       $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
+                               'index_config',
+                               'uid='.intval($cfgRec['uid']),
+                               array(
+                                       'set_id' => $setId,
+                                       'session_data' => '',
+                               )
+                       );
+
+                               // Build the parameters of the first queue entry, depending on configuration type:
+                       $procInstructions = array('[Index Cfg UID#'.$cfgRec['uid'].']');
+                       $params = NULL;
+                       if ($cfgRec['type'] == 1)       {       // Records from a table
+                               $params = array(
+                                       'indexConfigUid' => $cfgRec['uid'],
+                                       'url' => 'Records (start)',
+                                       'procInstructions' => $procInstructions
+                               );
+                       } elseif ($cfgRec['type'] == 3) {       // External URL
+                               $params = array(
+                                       'indexConfigUid' => $cfgRec['uid'],
+                                       'procInstructions' => $procInstructions,
+                                       'url' => $cfgRec['externalUrl'],
+                                       'depth' => 0
+                               );
+                       } elseif ($cfgRec['type'] == 2) {       // File path
+                               $params = array(
+                                       'indexConfigUid' => $cfgRec['uid'],
+                                       'procInstructions' => $procInstructions,
+                                       'url' => $cfgRec['filepath'],
+                                       'depth' => 0
+                               );
+                       }
+
+                               // Only known types produce a queue entry:
+                       if (is_array($params))  {
+                               $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
+                       }
+               }
+
+                       // Finally, reset the index configurations whose processing is finished:
+               $this->cleanUpOldRunningConfigurations();
+       }
+
+       /**
+        * Call back function for execution of a log element.
+        * Loads the indexing configuration referenced by $params['indexConfigUid'] and processes one step of it depending on the configuration type:
+        * - type 1: indexes the next (up to two) records from the configured table and queues a follow-up entry,
+        * - type 3: indexes the external URL in $params['url'] and queues the links found on it (until the configured depth is reached),
+        * - type 2: indexes the file in $params['url'] or, if it is a directory, queues its files (and sub directories until the configured depth is reached).
+        * Finally the (possibly modified) session data is written back to the configuration record.
+        *
+        * @param       array           Params from log element
+        * @param       object          Parent object (tx_crawler lib)
+        * @return      array           Result array; the input parameters are returned in the key "log"
+        */
+       function crawler_execute($params,&$pObj)        {
+
+                       // Indexer configuration ID must exist:
+               if ($params['indexConfigUid'])  {
+                       list($cfgRec) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
+                               '*',
+                               'index_config',
+                               'uid='.intval($params['indexConfigUid'])
+                       );
+
+                       if (is_array($cfgRec))  {
+
+                                       // Unpack session data (written by this class itself, see the end of this function):
+                               $session_data = unserialize($cfgRec['session_data']);
+
+                                       // Select which type:
+                               switch($cfgRec['type']) {
+                                       case 1: // Records from a table:
+                                               if ($cfgRec['table2index'] && isset($GLOBALS['TCA'][$cfgRec['table2index']]))   {
+
+                                                               // Init session data array if not already:
+                                                       if (!is_array($session_data))   {
+                                                               $session_data = array(
+                                                                       'uid' => 0
+                                                               );
+                                                       }
+
+                                                               // Init: Records are read from the alternative source page if set, otherwise from the page where the configuration is stored.
+                                                               // (This class has no "pObj" property, so the former fallback "$this->pObj->id" always resulted in pid 0.)
+                                                       $pid = intval($cfgRec['alternative_source_pid']) ? intval($cfgRec['alternative_source_pid']) : $cfgRec['pid'];
+                                                       $fieldList = t3lib_div::trimExplode(',',$cfgRec['fieldlist'],1);
+
+                                                               // Get root line:
+                                                       $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
+
+                                                               // Load indexer if not yet.
+                                                       $this->loadIndexerClass();
+
+                                                               // Select the next (max. two) records with a uid higher than the last one processed:
+                                                       $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
+                                                                               '*',
+                                                                               $cfgRec['table2index'],
+                                                                               'pid = '.intval($pid).'
+                                                                                       AND uid > '.intval($session_data['uid']).
+                                                                                       t3lib_BEfunc::deleteClause($cfgRec['table2index']),
+                                                                               '',
+                                                                               'uid',
+                                                                               '2'
+                                                                       );
+
+                                                               // Traverse:
+                                                       if (count($recs))       {
+                                                               foreach($recs as $r)    {
+
+                                                                               // (Re)-Indexing a row from a table:
+                                                                       $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
+                                                                       parse_str(str_replace('###UID###',$r['uid'],$cfgRec['get_params']),$GETparams);
+                                                                       $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
+                                                                       $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
+                                                                       $indexerObj->forceIndexing = TRUE;
+
+                                                                               // The first field in the list is used as title, the remaining fields are concatenated as content:
+                                                                       $theTitle = '';
+                                                                       $theContent = '';
+                                                                       foreach($fieldList as $k => $v) {
+                                                                               if (!$k)        {
+                                                                                       $theTitle = $r[$v];
+                                                                               } else {
+                                                                                       $theContent.= $r[$v].' ';
+                                                                               }
+                                                                       }
+
+                                                                       $indexerObj->backend_indexAsTYPO3Page(
+                                                                               $theTitle,
+                                                                               '',
+                                                                               '',
+                                                                               $theContent,
+                                                                               $GLOBALS['LANG']->charSet,
+                                                                               $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
+                                                                               $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
+                                                                               $r['uid']
+                                                                       );
+
+                                                                       #debug($indexerObj->internal_log);
+
+                                                                               // Update the UID we last processed:
+                                                                       $session_data['uid'] = $r['uid'];
+                                                               }
+
+
+                                                                       // Parameters for the follow-up entry which continues after the last processed record:
+                                                               $nparams = array(
+                                                                       'indexConfigUid' => $cfgRec['uid'],
+                                                                       'url' => 'Records from UID#'.($session_data['uid']+1).'-?',
+                                                                       'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
+                                                               );
+                                                                       //
+                                                               $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']);
+                                                       }
+                                               }
+                                       break;
+                                       case 3: // External URL:
+
+                                                       // Init session data array if not already:
+                                               if (!is_array($session_data))   {
+                                                       $session_data = array(
+                                                               'urlLog' => array($params['url'])
+                                                       );
+                                               }
+
+                                                       // Index the URL:
+                                               $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
+                                               $subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);
+
+                                                       // Add more elements to log now:
+                                               if ($params['depth'] < $cfgRec['depth'])        {
+                                                       foreach($subUrls as $url)       {
+                                                               if ($url = $this->checkUrl($url,$session_data['urlLog'],$cfgRec['externalUrl']))        {
+                                                                       $this->instanceCounter++;
+                                                                       $session_data['urlLog'][] = $url;
+
+                                                                               // Parameters:
+                                                                       $nparams = array(
+                                                                               'indexConfigUid' => $cfgRec['uid'],
+                                                                               'url' => $url,
+                                                                               'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
+                                                                               'depth' => $params['depth']+1
+                                                                       );
+                                                                               // Each new entry is scheduled $secondsPerExternalUrl seconds after the previous one:
+                                                                       $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
+                                                               }
+                                                       }
+                                               }
+                                       break;
+                                       case 2: // File path:
+
+                                                       // Prepare path, making it absolute and checking:
+                                                       // (PHP variable names are case-sensitive; the check must be done on $readpath, not on an undefined $readPath.)
+                                               $readpath = $params['url'];
+                                               if (!t3lib_div::isAbsPath($readpath))   {
+                                                       $readpath = t3lib_div::getFileAbsFileName($readpath);
+                                               }
+
+                                               if (t3lib_div::isAllowedAbsPath($readpath))     {
+                                                       if (@is_file($readpath))        {       // If file, index it!
+
+                                                                       // Get root line:
+                                                               $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
+
+                                                                       // Load indexer if not yet.
+                                                               $this->loadIndexerClass();
+
+                                                                       // (Re)-Indexing file on page.
+                                                               $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
+                                                               $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
+                                                               $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
+                                                               $indexerObj->hash['phash'] = -1;        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
+
+                                                                       // Index document (path relative to PATH_site):
+                                                               $indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE);
+                                                       } elseif (@is_dir($readpath)) { // If dir, read content and create new pending items for log:
+
+                                                                       // Select files and directories in path:
+                                                               $extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
+                                                               $fileArr = array();
+                                                               $files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0);
+
+                                                                       // Sub directories are only added as long as the configured depth is not reached:
+                                                               $directoryList = t3lib_div::get_dirs($readpath);
+                                                               if (is_array($directoryList) && $params['depth'] < $cfgRec['depth'])    {
+                                                                       foreach ($directoryList as $subdir)     {
+                                                                               if ((string)$subdir!='')        {
+                                                                                       $files[]= $readpath.$subdir.'/';
+                                                                               }
+                                                                       }
+                                                               }
+                                                               $files = t3lib_div::removePrefixPathFromList($files,PATH_site);
+
+                                                                       // traverse the items and create log entries:
+                                                               foreach($files as $path)        {
+                                                                       $this->instanceCounter++;
+                                                                       if ($path!==$params['url'])     {
+                                                                                       // Parameters:
+                                                                               $nparams = array(
+                                                                                       'indexConfigUid' => $cfgRec['uid'],
+                                                                                       'url' => $path,
+                                                                                       'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
+                                                                                       'depth' => $params['depth']+1
+                                                                               );
+                                                                               $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
+                                                                       }
+                                                               }
+                                                       }
+                                               }
+                                       break;
+                               }
+
+                                       // Save process data which might be modified:
+                               $field_array = array (
+                                       'session_data' => serialize($session_data)
+                               );
+                               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
+                       }
+               }
+
+               return array('log' => $params);
+       }
+
+       /**
+        * Look up all running index configurations and finish those which have no pending entries left in the crawler queue.
+        * For a finished configuration the index entries which belong to it but were not touched by the current run (different freeIndexSetId) are removed from all index tables and the "set_id" of the configuration is reset to zero.
+        *
+        * @return      void
+        */
+       function cleanUpOldRunningConfigurations()      {
+
+                       // Tables from which old registrations are removed (list copied from class.tx_indexedsearch_modfunc1.php)
+               $tableArr = explode(',','index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');
+
+                       // Lookup running index configurations:
+               $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
+                       'uid,set_id',
+                       'index_config',
+                       'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
+               );
+
+                       // For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
+               foreach($runningIndexingConfigurations as $cfgRec)      {
+
+                               // Look for ended processes (entries with exec_time=0 have not been executed yet):
+                       list($queued_items) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
+                               'count(*) AS count',
+                               'tx_crawler_queue',
+                               'set_id='.intval($cfgRec['set_id']).' AND exec_time=0'
+                       );
+
+                       if (!$queued_items['count'])    {
+
+                                       // Lookup old phash rows, i.e. rows of this configuration which do not carry the set-ID of the current run.
+                                       // The set-ID is passed through intval() like every other integer value put into a query in this class.
+                               $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
+                                       'phash,freeIndexUid,freeIndexSetId,externalUrl',
+                                       'index_phash',
+                                       'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.intval($cfgRec['set_id'])
+                               );
+
+                                       // Removing old registrations for all tables:
+                               foreach($oldPhashRows as $pHashRow)     {
+                                       foreach($tableArr as $table)    {
+                                               $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
+                                       }
+                               }
+
+                                       // End process by updating index-config record:
+                               $field_array = array (
+                                       'set_id' => 0,
+                                       'session_data' => '',
+                               );
+                               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
+                       }
+               }
+       }
+
+
+
+
+
+
+
+       /*****************************************
+        *
+        * Helper functions
+        *
+        *****************************************/
+
+       /**
+        * Check if an input URL is allowed to be indexed.
+        * The URL is normalized first (a double slash at the end is reduced to one slash, a fragment identifier is removed). It is accepted if it contains no "../", starts with the base URL and is not found in the URL log already.
+        *
+        * @param       string          URL
+        * @param       array           Array of already indexed URLs (input url is looked up here and must not exist already)
+        * @param       string          Base URL of the indexing process (input URL must be "inside" the base URL!)
+        * @return      string          Returns the (normalized) URL if OK, otherwise FALSE
+        */
+       function checkUrl($url,$urlLog,$baseUrl)        {
+
+                       // Reduce a double slash at the very end of the URL to a single slash.
+                       // preg_replace() is used instead of the deprecated ereg_replace(); the "D" modifier makes "$" match at the very end of the string only, like the POSIX expression did.
+               $url = preg_replace('#//$#D','/',$url);
+
+                       // Remove fragment identifier:
+               list($url) = explode('#',$url);
+
+               if (!strstr($url,'../') && t3lib_div::isFirstPartOfStr($url,$baseUrl) && !in_array($url,$urlLog))       {
+                       return $url;
+               }
+
+                       // URL rejected (explicit value as documented; callers test the result for truth):
+               return FALSE;
+       }
+
+       /**
+        * Indexing External URL.
+        * Indexes the document at the URL and collects the hyperlinks found in its content. Links without a scheme are prefixed with scheme and host of the indexed URL.
+        *
+        * @param       string          URL, http://....
+        * @param       integer         Page id to relate indexing to.
+        * @param       array           Rootline array to relate indexing to
+        * @param       integer         Configuration UID
+        * @param       integer         Set ID
+        * @return      array           URLs found on this page
+        */
+       function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)       {
+
+                       // Make sure the indexer class is available:
+               $this->loadIndexerClass();
+
+                       // Set up the indexer and index the external URL:
+               $indexer = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
+               $indexer->backend_initIndexer($pageId, 0, 0, '', $rl);
+               $indexer->backend_setFreeIndexUid($cfgUid, $setId);
+               $indexer->indexExternalUrl($url);
+
+                       // Parts of the indexed URL, used for links which have no scheme themselves:
+               $baseParts = parse_url($url);
+
+                       // Collect the URLs of all hyperlinks in the fetched content:
+               $foundUrls = array();
+               $links = $indexer->extractHyperLinks($indexer->indexExternalUrl_content);
+               foreach($links as $linkInfo)    {
+
+                               // Decode entities:
+                       $href = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
+
+                       $hrefParts = parse_url($href);
+                       $foundUrls[] = $hrefParts['scheme']
+                               ? $href
+                               : $baseParts['scheme'].'://'.$baseParts['host'].'/'.t3lib_div::resolveBackPath($href);
+               }
+
+               return $foundUrls;
+       }
+
+       /**
+        * Include the indexer class file of the "indexed_search" extension (once).
+        *
+        * @return      void
+        */
+       function loadIndexerClass()     {
+                       // The included script is executed in the scope of this function; presumably $TYPO3_CONF_VARS is declared global for that reason.
+               global $TYPO3_CONF_VARS;
+
+               $indexerClassFile = t3lib_extMgm::extPath('indexed_search').'class.indexer.php';
+               require_once($indexerClassFile);
+       }
+
+       /**
+        * Get rootline for closest TypoScript template root.
+        * Algorithm same as used in Web > Template, Object browser
+        *
+        * @param       integer         The page id to traverse rootline back from
+        * @return      array           Array with the uid values of the root line as set by the template parser (same keys as in the parsers root line).
+        */
+       function getUidRootLineForClosestTemplate($id)  {
+               global $TYPO3_CONF_VARS;
+
+                       // Load the required libraries:
+               foreach(array('class.t3lib_page.php','class.t3lib_tstemplate.php','class.t3lib_tsparser_ext.php') as $libraryFile)      {
+                       require_once(PATH_t3lib.$libraryFile);
+               }
+
+                       // Template parser; time-performance information is not logged:
+               $templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
+               $templateParser->tt_track = 0;
+               $templateParser->init();
+
+                       // Get the rootline of the page and let the template parser process it:
+               $pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
+               $pageRootLine = $pageSelect->getRootLine($id);
+               $templateParser->runThroughTemplates($pageRootLine,0);
+
+                       // Reduce the root line of the template parser to the uid values:
+               $uidList = array();
+               foreach($templateParser->rootLine as $level => $pageRow)        {
+                       $uidList[$level] = $pageRow['uid'];
+               }
+
+               return $uidList;
+       }
+}
+
+
+// XCLASS inclusion: If TYPO3_MODE is defined and a file is registered for this script in $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], that file is included (once).
+if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php'])   {
+       include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
+}
+?>
\ No newline at end of file
index 4f8566c..6ac2c42 100755 (executable)
@@ -263,6 +263,7 @@ class tx_indexedsearch_indexer {
                                                        // Set to zero:
                                                $this->conf['recordUid'] = 0;
                                                $this->conf['freeIndexUid'] = 0;
+                                               $this->conf['freeIndexSetId'] = 0;
 
                                                        // Init and start indexing:
                                                $this->init();
@@ -318,6 +319,7 @@ class tx_indexedsearch_indexer {
 
                        // Set to defaults
                $this->conf['freeIndexUid'] = 0;
+               $this->conf['freeIndexSetId'] = 0;
                $this->conf['page_cache_reg1'] = '';
 
                        // Root line uids
@@ -337,8 +339,9 @@ class tx_indexedsearch_indexer {
         * @param       integer         Free index UID
         * @return      void
         */
-       function backend_setFreeIndexUid($freeIndexUid) {
+       function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)      {       // $freeIndexSetId is optional and defaults to 0, so one-argument calls keep working
                $this->conf['freeIndexUid'] = $freeIndexUid;
+               $this->conf['freeIndexSetId'] = $freeIndexSetId;        // Stored in the indexer configuration next to the free index uid
        }
 
        /**
@@ -568,6 +571,7 @@ class tx_indexedsearch_indexer {
                                $this->log_pull();
                        } else {
                                $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestatmp
+                               $this->updateSetId($this->hash['phash']);
                                $this->update_grlist($checkCHash['phash'],$this->hash['phash']);        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
                                $this->updateRootline();
                                $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
@@ -842,7 +846,7 @@ class tx_indexedsearch_indexer {
                                t3lib_div::writeFile($tmpFile, $content);
 
                                        // Index that file:
-                               $this->indexRegularDocument($externalUrl, FALSE, $tmpFile, 'html');
+                               $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');      // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
                                unlink($tmpFile);
                        }
                }
@@ -1287,7 +1291,9 @@ class tx_indexedsearch_indexer {
                        'externalUrl' => 0,
                        'recordUid' => intval($this->conf['recordUid']),
                        'freeIndexUid' => intval($this->conf['freeIndexUid']),
+                       'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
                );
+
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
 
                        // PROCESSING index_section
@@ -1440,6 +1446,7 @@ class tx_indexedsearch_indexer {
                        'externalUrl' => $fileParts['scheme'] ? 1 : 0,
                        'recordUid' => intval($this->conf['recordUid']),
                        'freeIndexUid' => intval($this->conf['freeIndexUid']),
+                       'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
                );
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
 
@@ -1640,6 +1647,20 @@ class tx_indexedsearch_indexer {
        }
 
        /**
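+        * Sets the freeIndexSetId field of the index_phash row with the given phash to the set id from $this->conf['freeIndexSetId'].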
+        *
+        * @param       integer         phash value
+        * @return      void
+        */
+       function updateSetId($phash)    {
+                       // Only the set id column is written; its value comes from the indexer configuration, cast to integer:
+               $setIdField = array('freeIndexSetId' => intval($this->conf['freeIndexSetId']));
+               $rowSelector = 'phash='.intval($phash);
+
+               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $rowSelector, $setIdField);
+       }
+
+       /**
         * Update parsetime for phash row.
         *
         * @param       integer         phash value.
index 0976416..56440e8 100755 (executable)
@@ -1,6 +1,35 @@
 ***************
 TODO / projects:
-*****************
+***************
+
+Version 3:
+- Refactor "class.crawler.php"
+- Add timing controls
+- Hook + example for external clients
+- IndexCfg option:
+       - Exclude URLs (regex?), possibly with graphical interface selecting it from indexed content.
+       - Record-indexing: support languageField in records
+       - Crawling in the night (enter time for indexing configurations), but possible to force.
+       - New type: Page Tree (traversing a part at night like crawler allows us...)
+- Indexing configuration overview (including status and manual clearing possibility)
+- Set up test environment with CLI running
+
+Frontend:
+       - Searching in certain external site
+       - Mac "spotlight" like searching? (Define which categories are bundled and which not)
+       - Alternative presentations for when records are indexed.
+
+
+
+Testing indexing crawler for:
+       - 3DS
+       - TYPO3.org copy
+       - Metropol
+       - FI
+       - Link Factory
+       - Brunata
+
+**************
 
 Bugs / Issues:
 - The checkbox "No Search" in the page header is only respected by indexed_search during indexing! (A page will not be indexed when "No Search" is set). However when searching results are not filtered based on this flag - so if a page is indexed before the no search flag is set it will be found in search results. To change this is hard because the getTreeList() function that fetches all page ids cannot take a where-clause to filter it out but must have hardcoded support. Alternatively the pages table must be joined into the search result so we can select on the field. A solution is still not agreed upon.
@@ -18,38 +47,18 @@ Search test:
 - external media on multiple pages with DIFFERENT languages?
 
 Templating / Display in plugin:
-- Templating
-       - with new Template API?
-       - Still need to put a group together.
 - Support for FE visning af resultater i extra niveaer (ud over niv. 1,2 som er hardcoded)?
 - Configurable language parameter (hardcoded to "L" now)
 
-Indexing configurations (temporarily disabled):
-       - Alternative presentationer af når records er indexerede.
-       - incl. meta-data?
+Indexing configurations:
        - Tabelvælger som en del af sektionsvælgeren i frontend
-       - Record-indexing: support languageField in records
        Config i backend through flexforms:
                - baseUrl for external files?
                - language setting for files and external URLs?
 
-CLI:
+CLI feature ideas:
 - Removal of old indexes
        - delete results with large tstamp (thats all....)
-- Indexing configurations
-       - (Indexing of records from tables should be done automatically in TCEmain with a hook for create/update/delete)
-       - Look up all index configurations
-       - Look up phash records (field tstamp) based on config-uid
-               - For files: read files from directories, compare mtime with records;
-               - For URLs: Forced
-               - For records: read records
-                       - All new entries are indexed, all old are removed, all changed are re-indexed
-       DKM:
-       - Exclude URLs (regex?), possibly with graphical interface selecting it from indexed content.
-       - Searching in certain external site
-       - Mac "spotlight" like searching? (Define which categories are bundled and which not)
-       - Crawling in the night (enter time for indexing configurations), but possible to force.
-       - Services: Way to hook in content as a forth "indexing configuration" type!
 
 Backend modules:
 - Much nicer detail display
@@ -63,14 +72,6 @@ Ideas:
 Hook development:
 - Example of search-SQL hook
 
-Testing indexing crawler for:
-       - 3DS
-       - TYPO3.org copy
-       - Metropol
-       - FI
-       - Link Factory
-       - Brunata
-
 Documentation:
 - Configuration possibilities (piVars, TypoScript, Hooks etc)
 - How to setup up, analyse and debug indexed search (manual)
index 4d4a56f..1ecb0c0 100755 (executable)
@@ -13,6 +13,7 @@ $TYPO3_CONF_VARS['SC_OPTIONS']['tslib/class.tslib_fe.php']['headerNoCache']['tx_
 
        // Register with "crawler" extension:
 $TYPO3_CONF_VARS['EXTCONF']['crawler']['procInstructions']['tx_indexedsearch_reindex'] = 'Re-indexing';
+$TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks']['tx_indexedsearch_crawl'] = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';
 
        // Configure default document parsers:
 $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] = array(
index 46086ea..169dd42 100755 (executable)
@@ -32,8 +32,10 @@ CREATE TABLE index_phash (
   externalUrl tinyint(3) DEFAULT '0' NOT NULL,
   recordUid int(11) DEFAULT '0' NOT NULL,
   freeIndexUid int(11) DEFAULT '0' NOT NULL,
+  freeIndexSetId int(11) DEFAULT '0' NOT NULL,
   PRIMARY KEY (phash),
-  KEY phash_grouping (phash_grouping)
+  KEY phash_grouping (phash_grouping),
+  KEY freeIndexUid (freeIndexUid)
 );
 
 #
@@ -153,6 +155,13 @@ CREATE TABLE index_config (
     cruser_id int(11) unsigned DEFAULT '0' NOT NULL,
     hidden tinyint(4) unsigned DEFAULT '0' NOT NULL,
     starttime int(11) unsigned DEFAULT '0' NOT NULL,
+
+    set_id int(11) DEFAULT '0' NOT NULL,
+    session_data mediumtext NOT NULL,
+    first_run_time int(11) unsigned DEFAULT '0' NOT NULL,
+    frequency int(11) unsigned DEFAULT '0' NOT NULL,
+    last_run int(11) unsigned DEFAULT '0' NOT NULL,
+
     title tinytext NOT NULL,
     description text NOT NULL,
     type int(11) unsigned DEFAULT '0' NOT NULL,
index b51e40f..3dd84f0 100755 (executable)
@@ -136,7 +136,6 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
                                0 => 'Overview',
                                1 => 'Technical Details',
                                2 => 'Words and content',
-                               3 => 'Indexing'
                        )
                );
     }
@@ -217,22 +216,13 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
                } else {        // Detail listings:
                                // Depth function menu:
                        $h_func = t3lib_BEfunc::getFuncMenu($this->pObj->id,'SET[type]',$this->pObj->MOD_SETTINGS['type'],$this->pObj->MOD_MENU['type'],'index.php');
-                       if (t3lib_div::inList('0,1,2',$this->pObj->MOD_SETTINGS['type']))       {
-                               $h_func.= t3lib_BEfunc::getFuncMenu($this->pObj->id,'SET[depth]',$this->pObj->MOD_SETTINGS['depth'],$this->pObj->MOD_MENU['depth'],'index.php');
+                       $h_func.= t3lib_BEfunc::getFuncMenu($this->pObj->id,'SET[depth]',$this->pObj->MOD_SETTINGS['depth'],$this->pObj->MOD_MENU['depth'],'index.php');
 
-                                       // Show title / function menu:
-                               $theOutput.=$this->pObj->doc->spacer(5);
-                               $theOutput.=$this->pObj->doc->section($LANG->getLL('title'),$h_func,0,1);
-
-                               $theOutput.=$this->drawTableOfIndexedPages();
-                       } else {
-
-                                       // Show title / function menu:
-                               $theOutput.= $this->pObj->doc->spacer(5);
-                               $theOutput.= $this->pObj->doc->section($LANG->getLL('title'),$h_func,0,1);
+                               // Show title / function menu:
+                       $theOutput.=$this->pObj->doc->spacer(5);
+                       $theOutput.=$this->pObj->doc->section($LANG->getLL('title'),$h_func,0,1);
 
-                               $theOutput.= $this->extraIndexing();
-                       }
+                       $theOutput.=$this->drawTableOfIndexedPages();
                }
 
         return $theOutput;
@@ -317,7 +307,7 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
                                        'ISEC.*, IP.*, count(*) AS count_val',
                                        'index_phash IP, index_section ISEC',
                                        'IP.phash = ISEC.phash AND ISEC.page_id = '.intval($data['uid']),
-                                       'IP.phash,IP.phash_grouping,IP.cHashParams,IP.data_filename,IP.data_page_id,IP.data_page_reg1,IP.data_page_type,IP.data_page_mp,IP.gr_list,IP.item_type,IP.item_title,IP.item_description,IP.item_mtime,IP.tstamp,IP.item_size,IP.contentHash,IP.crdate,IP.parsetime,IP.sys_language_uid,IP.item_crdate,ISEC.phash,ISEC.phash_t3,ISEC.rl0,ISEC.rl1,ISEC.rl2,ISEC.page_id,ISEC.uniqid,IP.externalUrl,IP.recordUid,IP.freeIndexUid',
+                                       'IP.phash,IP.phash_grouping,IP.cHashParams,IP.data_filename,IP.data_page_id,IP.data_page_reg1,IP.data_page_type,IP.data_page_mp,IP.gr_list,IP.item_type,IP.item_title,IP.item_description,IP.item_mtime,IP.tstamp,IP.item_size,IP.contentHash,IP.crdate,IP.parsetime,IP.sys_language_uid,IP.item_crdate,ISEC.phash,ISEC.phash_t3,ISEC.rl0,ISEC.rl1,ISEC.rl2,ISEC.page_id,ISEC.uniqid,IP.externalUrl,IP.recordUid,IP.freeIndexUid,IP.freeIndexSetId',
                                        'IP.item_type, IP.tstamp',
                                        ($this->maxListPerPage+1)
                                );
@@ -441,7 +431,7 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
                                $lines[] = '<td>'.$this->printRootlineInfo($row).'</td>';
                                $lines[] = '<td>'.($row['page_id'] ? $row['page_id'] : '&nbsp;').'</td>';
                                $lines[] = '<td>'.($row['phash_t3']!=$row['phash'] ? $row['phash_t3'] : '&nbsp;').'</td>';
-                               $lines[] = '<td>'.($row['freeIndexUid'] ? $row['freeIndexUid'] : '&nbsp;').'</td>';
+                               $lines[] = '<td>'.($row['freeIndexUid'] ? $row['freeIndexUid'].($row['freeIndexSetId']?'/'.$row['freeIndexSetId']:'') : '&nbsp;').'</td>';
                                $lines[] = '<td>'.($row['recordUid'] ? $row['recordUid'] : '&nbsp;').'</td>';
 
 
@@ -1270,172 +1260,6 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
 
        /********************************
         *
-        * Indexing of configurations
-        *
-        *******************************/
-
-       /**
-        * [Describe function...]
-        *
-        * @return      [type]          ...
-        */
-       function extraIndexing()        {
-
-                       // Select index configurations on this page
-               $ftrows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
-                                       '*',
-                                       'index_config',
-                                       'pid = '.intval($this->pObj->id).
-                                               ' AND hidden=0'.
-                                               ' AND starttime<'.time()
-                               );
-
-
-               $rl = $this->getUidRootLineForClosestTemplate($this->pObj->id);
-
-               foreach($ftrows as $cfgRow)             {
-                       switch($cfgRow['type']) {
-                               case 1:
-                                       if ($cfgRow['table2index'] && isset($GLOBALS['TCA'][$cfgRow['table2index']]))   {
-
-                                                       // Init:
-                                               $pid = intval($cfgRow['alternative_source_pid']) ? intval($cfgRow['alternative_source_pid']) : $this->pObj->id;
-                                               $fieldList = t3lib_div::trimExplode(',',$cfgRow['fieldlist'],1);
-
-                                                       // Select
-                                               $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
-                                                                       '*',
-                                                                       $cfgRow['table2index'],
-                                                                       'pid = '.intval($pid)
-                                                               );
-
-                                                       // Traverse:
-                                               foreach($recs as $r)    {
-                                                               // (Re)-Indexing a row from a table:
-                                                       $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
-                                                       parse_str(str_replace('###UID###',$r['uid'],$cfgRow['get_params']),$GETparams);
-                                                       $indexerObj->backend_initIndexer($this->pObj->id, 0, 0, '', $rl, $GETparams, $cfgRow['chashcalc'] ? TRUE : FALSE);
-                                                       $indexerObj->backend_setFreeIndexUid($cfgRow['uid']);
-
-                                                       $theContent = '';
-                                                       foreach($fieldList as $k => $v) {
-                                                               if (!$k)        {
-                                                                       $theTitle = $r[$v];
-                                                               } else {
-                                                                       $theContent.= $r[$v].' ';
-                                                               }
-                                                       }
-#debug($theContent,$theTitle);
-                                                       $indexerObj->backend_indexAsTYPO3Page(
-                                                                       $theTitle,
-                                                                       '',
-                                                                       '',
-                                                                       $theContent,
-                                                                       $GLOBALS['LANG']->charSet,
-                                                                       $r[$GLOBALS['TCA'][$cfgRow['table2index']]['ctrl']['tstamp']],
-                                                                       $r[$GLOBALS['TCA'][$cfgRow['table2index']]['ctrl']['crdate']],
-                                                                       $r['uid']
-                                                               );
-
-                                               }
-#debug($recs);
-                                       }
-                               break;
-                               case 2:
-                                       $readpath = $cfgRow['filepath'];
-                                       if (!t3lib_div::isAbsPath($readPath))   {
-                                               $readpath = t3lib_div::getFileAbsFileName($readpath);
-                                       }
-#debug($readpath,'$readpath');
-
-                                       if (t3lib_div::isAllowedAbsPath($readpath))     {
-                                               $extList = implode(',',t3lib_div::trimExplode(',',$cfgRow['extensions'],1));
-                                               $fileArr = array();
-                                               $files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,$cfgRow['depth']);
-                                               $files = t3lib_div::removePrefixPathFromList($files,PATH_site);
-#debug($files);
-                                               foreach($files as $path)        {
-                                                               // (Re)-Indexing file on page.
-                                                       $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
-                                                       $indexerObj->backend_initIndexer($this->pObj->id, 0, 0, '', $rl);
-                                                       $indexerObj->backend_setFreeIndexUid($cfgRow['uid']);
-                                                       $indexerObj->hash['phash'] = -1;        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
-
-                                                       $indexerObj->indexRegularDocument($path, TRUE);
-
-#debug($indexerObj->internal_log,$resultRow['data_filename']);
-#debug($indexerObj->file_phash_arr,'file_phash_arr');
-#debug($indexerObj->hash,'hash');
-
-                                               }
-                                       }
-                               break;
-                               case 3:
-                                       if ($cfgRow['externalUrl'])     {
-                                               $this->indexExtUrlRecursively($cfgRow['externalUrl'], $cfgRow['depth'], $this->pObj->id, $rl, $cfgRow['uid']);
-                                       }
-                               break;
-                       }
-               }
-       }
-
-       /**
-        * Indexing URL recursively
-        * Still needs some work; eg. parameters to type, language, MP var is not passed yet...
-        *
-        * @param       string          URL, http://....
-        * @param       integer         Depth of recursion. 0 (zero) = only input URL
-        * @param       integer         Page id to relate indexing to.
-        * @param       array           Rootline array to relate indexing to
-        * @param       integer         Configuration UID
-        * @return      void
-        */
-       function indexExtUrlRecursively($url, $depth, $pageId, $rl, $cfgUid)    {
-
-                       // Index external URL:
-               $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
-               $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
-               $indexerObj->backend_setFreeIndexUid($cfgUid);
-
-               $indexerObj->indexExternalUrl($url);
-               $url_qParts = parse_url($url);
-
-                       // Recursion:
-               if ($depth>0)   {
-                       $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
-
-                                                       // Traverse links:
-                       foreach($list as $count => $linkInfo)   {
-
-                                       // Decode entities:
-                               $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
-
-                               $qParts = parse_url($linkSource);
-                               if (!$qParts['scheme']) {
-                                       $linkSource = $url_qParts['scheme'].'://'.$url_qParts['host'].'/'.$linkSource;
-                               }
-
-                               $this->indexExtUrlRecursively($linkSource, $depth-1, $pageId, $rl, $cfgUid);
-
-                                       // Temporary limit until we know how to handle hundreds of URLs with limited parsetime in PHP...
-                               if ($count>3)   break;
-                       }
-               }
-       }
-
-
-
-
-
-
-
-
-
-
-
-
-       /********************************
-        *
         * SQL functions
         *
         *******************************/
index b15fb1d..3172230 100644 (file)
@@ -9,7 +9,7 @@ $TCA['index_config'] = Array (
     'feInterface' => $TCA['index_config']['feInterface'],
     'columns' => Array (
         'hidden' => Array (
-            'label' => 'LLL:EXT:lang/locallang_general.php:LGL.hidden',
+            'label' => 'Disable',
             'config' => Array (
                 'type' => 'check',
                 'default' => '0'
@@ -139,7 +139,7 @@ $TCA['index_config'] = Array (
         '0' => Array('showitem' => 'title;;1;;2-2-2, description, type'),
         '1' => Array('showitem' => 'title;;1;;2-2-2, description, type;;;;3-3-3, table2index;;;;3-3-3, alternative_source_pid, fieldlist, get_params, chashcalc'),
         '2' => Array('showitem' => 'title;;1;;2-2-2, description, type;;;;3-3-3, filepath;;;;3-3-3, extensions, depth'),
-        '3' => Array('showitem' => 'title;;1;;2-2-2, description, type;;;;3-3-3, externalUrl;;;;3-3-3, depth')
+        '3' => Array('showitem' => 'title;;1;;2-2-2, description, type;;;;3-3-3, externalUrl;;;;3-3-3, depth'),
     ),
     'palettes' => Array (
         '1' => Array('showitem' => 'starttime,hidden')