More work on indexing configurations
author Kasper Skårhøj <kasper@typo3.org>
Thu, 22 Dec 2005 20:55:33 +0000 (20:55 +0000)
committer Kasper Skårhøj <kasper@typo3.org>
Thu, 22 Dec 2005 20:55:33 +0000 (20:55 +0000)
git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@923 709f56b5-9817-0410-a4d7-c38de5d9e867

typo3/sysext/indexed_search/class.crawler.php
typo3/sysext/indexed_search/class.indexer.php
typo3/sysext/indexed_search/doc/TODO.txt
typo3/sysext/indexed_search/ext_localconf.php
typo3/sysext/indexed_search/ext_tables.php
typo3/sysext/indexed_search/ext_tables.sql
typo3/sysext/indexed_search/locallang_csh_indexcfg.xml [new file with mode: 0755]
typo3/sysext/indexed_search/locallang_db.xml
typo3/sysext/indexed_search/tca.php

index 3d520e1..342462f 100755 (executable)
  *
  *
  *
- *   59: class tx_indexedsearch_crawler
- *   70:     function crawler_init(&$pObj)
- *  119:     function crawler_execute($params,&$pObj)
- *  180:     function checkUrl($url,$urlLog,$baseUrl)
- *  212:     function indexExtUrl($url, $pageId, $rl, $cfgUid)
- *  251:     function loadIndexerClass()
- *  263:     function getUidRootLineForClosestTemplate($id)
+ *   80: class tx_indexedsearch_crawler
+ *   99:     function crawler_init(&$pObj)
+ *  197:     function crawler_execute($params,&$pObj)
+ *  256:     function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
+ *  316:     function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
+ *  385:     function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
+ *  425:     function cleanUpOldRunningConfigurations()
  *
- * TOTAL FUNCTIONS: 6
+ *              SECTION: Helper functions
+ *  491:     function checkUrl($url,$urlLog,$baseUrl)
+ *  514:     function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
+ *  556:     function indexSingleRecord($r,$cfgRec,$rl=NULL)
+ *  605:     function loadIndexerClass()
+ *  617:     function getUidRootLineForClosestTemplate($id)
+ *  650:     function generateNextIndexingTime($cfgRec)
+ *  689:     function checkDeniedSuburls($url, $url_deny)
+ *
+ *              SECTION: Hook functions for TCEmain (indexing of records)
+ *  725:     function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
+ *
+ * TOTAL FUNCTIONS: 14
  * (This index is automatically created/updated by the extension "extdeveval")
  *
  */
 
 
 
+
+# To make sure the backend charset is available:
+require_once(PATH_typo3.'sysext/lang/lang.php');
+if (!is_object($GLOBALS['LANG']))      {
+       $GLOBALS['LANG'] = t3lib_div::makeInstance('language');
+       $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
+}
+
+
 /**
  * Crawler hook for indexed search. Works with the "crawler" extension
  *
 class tx_indexedsearch_crawler {
 
                // Static:
-       var $secondsPerExternalUrl = 3;         // Number of seconds to use as interval between queued indexing operations of URLs
+       var $secondsPerExternalUrl = 3;         // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
 
                // Internal, dynamic:
-       var $instanceCounter = 0;               // Counts up for each added URL
+       var $instanceCounter = 0;               // Counts up for each added URL (type 3)
 
                // Internal, static:
        var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';               // The object reference to this class.
@@ -79,13 +100,13 @@ class tx_indexedsearch_crawler {
 
                        // Select all indexing configuration which are waiting to be activated:
                $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
-                       'uid,pid,first_run_time,frequency,last_run,type,externalUrl,filepath',
+                       '*',
                        'index_config',
                        'hidden=0
                                AND (starttime=0 OR starttime<='.time().')
+                               AND timer_next_indexing<'.time().'
                                AND set_id=0
                                '.t3lib_BEfunc::deleteClause('index_config')
-
                );
 
                        // For each configuration, check if it should be executed and if so, start:
@@ -94,49 +115,71 @@ class tx_indexedsearch_crawler {
                                // Generate a unique set-ID:
                        $setId = t3lib_div::md5int(microtime());
 
+                               // Get next time:
+                       $nextTime = $this->generateNextIndexingTime($cfgRec);
+
                                // Start process by updating index-config record:
                        $field_array = array (
                                'set_id' => $setId,
+                               'timer_next_indexing' => $nextTime,
                                'session_data' => '',
                        );
                        $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
 
                                // Based on configuration type:
                        switch($cfgRec['type']) {
-                               case 1:
+                               case 1: // RECORDS:
+
                                                // Parameters:
                                        $params = array(
                                                'indexConfigUid' => $cfgRec['uid'],
-                                               'url' => 'Records (start)',
-                                               'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
+                                               'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
+                                               'url' => 'Records (start)',     // Just for show.
                                        );
                                                //
                                        $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
                                break;
-                               case 3: // External URL:
+                               case 2: // FILES:
 
                                                // Parameters:
                                        $params = array(
                                                'indexConfigUid' => $cfgRec['uid'],             // General
                                                'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),      // General
-                                               'url' => $cfgRec['externalUrl'],        // Partly general... (for URL and file types)
+                                               'url' => $cfgRec['filepath'],   // Partly general... (for URL and file types)
                                                'depth' => 0    // Specific for URL and file types
                                        );
 
                                        $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
                                break;
-                               case 2:
+                               case 3: // External URL:
 
                                                // Parameters:
                                        $params = array(
                                                'indexConfigUid' => $cfgRec['uid'],             // General
                                                'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),      // General
-                                               'url' => $cfgRec['filepath'],   // Partly general... (for URL and file types)
+                                               'url' => $cfgRec['externalUrl'],        // Partly general... (for URL and file types)
                                                'depth' => 0    // Specific for URL and file types
                                        );
 
                                        $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
                                break;
+                               default:
+                                       if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])       {
+                                               $hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
+
+                                               if (is_object($hookObj))        {
+
+                                                               // Parameters:
+                                                       $params = array(
+                                                               'indexConfigUid' => $cfgRec['uid'],             // General
+                                                               'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]'),       // General
+                                                               'url' => $hookObj->initMessage($message),
+                                                       );
+
+                                                       $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
+                                               }
+                                       }
+                               break;
                        }
                }
 
@@ -147,7 +190,7 @@ class tx_indexedsearch_crawler {
        /**
         * Call back function for execution of a log element
         *
-        * @param       array           Params from log element
+        * @param       array           Params from log element. Must contain $params['indexConfigUid']
         * @param       object          Parent object (tx_crawler lib)
         * @return      array           Result array
         */
@@ -155,6 +198,8 @@ class tx_indexedsearch_crawler {
 
                        // Indexer configuration ID must exist:
                if ($params['indexConfigUid'])  {
+
+                               // Load the indexing configuration record:
                        list($cfgRec) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
                                '*',
                                'index_config',
@@ -168,175 +213,22 @@ class tx_indexedsearch_crawler {
 
                                        // Select which type:
                                switch($cfgRec['type']) {
-                                       case 1:
-                                               if ($cfgRec['table2index'] && isset($GLOBALS['TCA'][$cfgRec['table2index']]))   {
-
-                                                               // Init session data array if not already:
-                                                       if (!is_array($session_data))   {
-                                                               $session_data = array(
-                                                                       'uid' => 0
-                                                               );
-                                                       }
-
-                                                               // Init:
-                                                       $pid = intval($cfgRec['alternative_source_pid']) ? intval($cfgRec['alternative_source_pid']) : $this->pObj->id;
-                                                       $fieldList = t3lib_div::trimExplode(',',$cfgRec['fieldlist'],1);
-
-                                                               // Get root line:
-                                                       $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
-
-                                                               // Load indexer if not yet.
-                                                       $this->loadIndexerClass();
-
-                                                               // Select
-                                                       $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
-                                                                               '*',
-                                                                               $cfgRec['table2index'],
-                                                                               'pid = '.intval($pid).'
-                                                                                       AND uid > '.intval($session_data['uid']).
-                                                                                       t3lib_BEfunc::deleteClause($cfgRec['table2index']),
-                                                                               '',
-                                                                               'uid',
-                                                                               '2'
-                                                                       );
-
-                                                               // Traverse:
-                                                       if (count($recs))       {
-                                                               foreach($recs as $r)    {
-
-                                                                               // (Re)-Indexing a row from a table:
-                                                                       $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
-                                                                       parse_str(str_replace('###UID###',$r['uid'],$cfgRec['get_params']),$GETparams);
-                                                                       $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
-                                                                       $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
-                                                                       $indexerObj->forceIndexing = TRUE;
-
-                                                                       $theContent = '';
-                                                                       foreach($fieldList as $k => $v) {
-                                                                               if (!$k)        {
-                                                                                       $theTitle = $r[$v];
-                                                                               } else {
-                                                                                       $theContent.= $r[$v].' ';
-                                                                               }
-                                                                       }
-
-                                                                       $indexerObj->backend_indexAsTYPO3Page(
-                                                                               $theTitle,
-                                                                               '',
-                                                                               '',
-                                                                               $theContent,
-                                                                               $GLOBALS['LANG']->charSet,
-                                                                               $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
-                                                                               $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
-                                                                               $r['uid']
-                                                                       );
-
-                                                                       #debug($indexerObj->internal_log);
-
-                                                                               // Update the UID we last processed:
-                                                                       $session_data['uid'] = $r['uid'];
-                                                               }
-
-
-                                                                       // Parameters:
-                                                               $nparams = array(
-                                                                       'indexConfigUid' => $cfgRec['uid'],
-                                                                       'url' => 'Records from UID#'.($r['uid']+1).'-?',
-                                                                       'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
-                                                               );
-                                                                       //
-                                                               $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']);
-                                                       }
-                                               }
+                                       case 1: // Records:
+                                               $this->crawler_execute_type1($cfgRec,$session_data,$params,$pObj);
+                                       break;
+                                       case 2: // Files
+                                               $this->crawler_execute_type2($cfgRec,$session_data,$params,$pObj);
                                        break;
                                        case 3: // External URL:
-
-                                                       // Init session data array if not already:
-                                               if (!is_array($session_data))   {
-                                                       $session_data = array(
-                                                               'urlLog' => array($params['url'])
-                                                       );
-                                               }
-
-                                                       // Index the URL:
-                                               $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
-                                               $subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);
-
-                                                       // Add more elements to log now:
-                                               if ($params['depth'] < $cfgRec['depth'])        {
-                                                       foreach($subUrls as $url)       {
-                                                               if ($url = $this->checkUrl($url,$session_data['urlLog'],$cfgRec['externalUrl']))        {
-                                                                       $this->instanceCounter++;
-                                                                       $session_data['urlLog'][] = $url;
-
-                                                                               // Parameters:
-                                                                       $nparams = array(
-                                                                               'indexConfigUid' => $cfgRec['uid'],
-                                                                               'url' => $url,
-                                                                               'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
-                                                                               'depth' => $params['depth']+1
-                                                                       );
-                                                                       $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
-                                                               }
-                                                       }
-                                               }
+                                               $this->crawler_execute_type3($cfgRec,$session_data,$params,$pObj);
                                        break;
-                                       case 2:
+                                       default:
+                                               if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])       {
+                                                       $hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
 
-                                                       // Prepare path, making it absolute and checking:
-                                               $readpath = $params['url'];
-                                               if (!t3lib_div::isAbsPath($readPath))   {
-                                                       $readpath = t3lib_div::getFileAbsFileName($readpath);
-                                               }
-
-                                               if (t3lib_div::isAllowedAbsPath($readpath))     {
-                                                       if (@is_file($readpath))        {       // If file, index it!
-
-                                                                       // Get root line:
-                                                               $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
-
-                                                                       // Load indexer if not yet.
-                                                               $this->loadIndexerClass();
-
-                                                                       // (Re)-Indexing file on page.
-                                                               $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
-                                                               $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
-                                                               $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
-                                                               $indexerObj->hash['phash'] = -1;        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
-
-                                                                       // Index document:
-                                                               $indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE);
-                                                       } elseif (@is_dir($readpath)) { // If dir, read content and create new pending items for log:
-
-                                                                       // Select files and directories in path:
-                                                               $extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
-                                                               $fileArr = array();
-                                                               $files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0);
-
-                                                               $directoryList = t3lib_div::get_dirs($readpath);
-                                                               if (is_array($directoryList) && $params['depth'] < $cfgRec['depth'])    {
-                                                                       foreach ($directoryList as $subdir)     {
-                                                                               if ((string)$subdir!='')        {
-                                                                                       $files[]= $readpath.$subdir.'/';
-                                                                               }
-                                                                       }
-                                                               }
-                                                               $files = t3lib_div::removePrefixPathFromList($files,PATH_site);
-
-                                                                       // traverse the items and create log entries:
-                                                               foreach($files as $path)        {
-                                                                       $this->instanceCounter++;
-                                                                       if ($path!==$params['url'])     {
-                                                                                       // Parameters:
-                                                                               $nparams = array(
-                                                                                       'indexConfigUid' => $cfgRec['uid'],
-                                                                                       'url' => $path,
-                                                                                       'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
-                                                                                       'depth' => $params['depth']+1
-                                                                               );
-                                                                               $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
-                                                                       }
-                                                               }
+                                                       if (is_object($hookObj))        {
+                                                               $this->pObj = &$pObj;   // For addQueueEntryForHook()
+                                                               $hookObj->indexOperation($cfgRec,$session_data,$params,$this);
                                                        }
                                                }
                                        break;
@@ -354,6 +246,179 @@ class tx_indexedsearch_crawler {
        }
 
        /**
+        * Indexing records from a table
+        *
+        * @param       array           Indexing Configuration Record
+        * @param       array           Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
+        * @param       array           Parameters from the log queue.
+        * @param       object          Parent object (from "crawler" extension!)
+        * @return      void
+        */
+       function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)   {
+               if ($cfgRec['table2index'] && isset($GLOBALS['TCA'][$cfgRec['table2index']]))   {
+
+                               // Init session data array if not already:
+                       if (!is_array($session_data))   {
+                               $session_data = array(
+                                       'uid' => 0
+                               );
+                       }
+
+                               // Init:
+                       $pid = intval($cfgRec['alternative_source_pid']) ? intval($cfgRec['alternative_source_pid']) : $this->pObj->id;
+                       $numberOfRecords = $cfgRec['recordsbatch'] ? t3lib_div::intInRange($cfgRec['recordsbatch'],1) : 100;
+
+                               // Get root line:
+                       $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
+
+                               // Select
+                       $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
+                                               '*',
+                                               $cfgRec['table2index'],
+                                               'pid = '.intval($pid).'
+                                                       AND uid > '.intval($session_data['uid']).
+                                                       t3lib_BEfunc::deleteClause($cfgRec['table2index']),
+                                               '',
+                                               'uid',
+                                               $numberOfRecords
+                                       );
+
+                               // Traverse:
+                       if (count($recs))       {
+                               foreach($recs as $r)    {
+
+                                               // Index single record:
+                                       $this->indexSingleRecord($r,$cfgRec,$rl);
+
+                                               // Update the UID we last processed:
+                                       $session_data['uid'] = $r['uid'];
+                               }
+
+                                       // Finally, set entry for next indexing of batch of records:
+                               $nparams = array(
+                                       'indexConfigUid' => $cfgRec['uid'],
+                                       'url' => 'Records from UID#'.($r['uid']+1).'-?',
+                                       'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
+                               );
+                               $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']);
+                       }
+               }
+       }
+
+       /**
+        * Indexing files from fileadmin
+        *
+        * @param       array           Indexing Configuration Record
+        * @param       array           Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
+        * @param       array           Parameters from the log queue.
+        * @param       object          Parent object (from "crawler" extension!)
+        * @return      void
+        */
+       function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)   {
+
+                       // Prepare path, making it absolute and checking (the variable is $readpath, all lowercase - PHP variable names are case-sensitive):
+               $readpath = $params['url'];
+               if (!t3lib_div::isAbsPath($readpath))   {
+                       $readpath = t3lib_div::getFileAbsFileName($readpath);
+               }
+
+               if (t3lib_div::isAllowedAbsPath($readpath))     {
+                       if (@is_file($readpath))        {       // If file, index it!
+
+                                       // Get root line (need to provide this when indexing external files)
+                               $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
+
+                                       // Load indexer if not yet.
+                               $this->loadIndexerClass();
+
+                                       // (Re)-Indexing file on page.
+                               $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
+                               $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
+                               $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
+                               $indexerObj->hash['phash'] = -1;        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
+
+                                       // Index document (path is passed relative to PATH_site):
+                               $indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE);
+                       } elseif (@is_dir($readpath)) { // If dir, read content and create new pending items for log:
+
+                                       // Select files and directories in path:
+                               $extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
+                               $fileArr = array();
+                               $files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0);
+
+                               $directoryList = t3lib_div::get_dirs($readpath);
+                               if (is_array($directoryList) && $params['depth'] < $cfgRec['depth'])    {       // Sub directories are only queued while the configured depth is not reached
+                                       foreach ($directoryList as $subdir)     {
+                                               if ((string)$subdir!='')        {
+                                                       $files[]= $readpath.$subdir.'/';
+                                               }
+                                       }
+                               }
+                               $files = t3lib_div::removePrefixPathFromList($files,PATH_site);
+
+                                       // traverse the items and create log entries:
+                               foreach($files as $path)        {
+                                       $this->instanceCounter++;
+                                       if ($path!==$params['url'])     {       // Never re-queue the entry that is being processed right now
+                                                       // Parameters:
+                                               $nparams = array(
+                                                       'indexConfigUid' => $cfgRec['uid'],
+                                                       'url' => $path,
+                                                       'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
+                                                       'depth' => $params['depth']+1
+                                               );
+                                               $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
+                                       }
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Indexing External URLs
+        *
+        * @param       array           Indexing Configuration Record
+        * @param       array           Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
+        * @param       array           Parameters from the log queue.
+        * @param       object          Parent object (from "crawler" extension!)
+        * @return      void
+        */
+       function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)   {
+
+                       // First call of a session: start the URL log with the entry URL.
+               if (!is_array($session_data))   {
+                       $session_data = array('urlLog' => array($params['url']));
+               }
+
+                       // Index this URL; the return value is treated as the list of sub-URLs to consider:
+               $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
+               $subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);
+
+                       // Nothing more to queue once the configured depth has been reached:
+               if (!($params['depth'] < $cfgRec['depth']))     {
+                       return;
+               }
+
+                       // Queue every sub-URL that passes checkUrl() and is not matched by the deny list:
+               foreach($subUrls as $candidateUrl)      {
+                       $url = $this->checkUrl($candidateUrl,$session_data['urlLog'],$cfgRec['externalUrl']);
+                       if (!$url || $this->checkDeniedSuburls($url, $cfgRec['url_deny']))      {
+                               continue;
+                       }
+                       $this->instanceCounter++;
+                       $session_data['urlLog'][] = $url;
+                       $scheduledTime = time()+$this->instanceCounter*$this->secondsPerExternalUrl;
+                       $nparams = array(
+                               'indexConfigUid' => $cfgRec['uid'],
+                               'url' => $url,
+                               'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
+                               'depth' => $params['depth']+1
+                       );
+                       $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],$scheduledTime);
+               }
+       }
+
+       /**
         * Look up all old index configurations which are finished and needs to be reset and done
         *
         * @return      void
@@ -381,7 +446,7 @@ class tx_indexedsearch_crawler {
 
                                        // Lookup old phash rows:
                                $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
-                                       'phash,freeIndexUid,freeIndexSetId,externalUrl',
+                                       'phash',
                                        'index_phash',
                                        'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.$cfgRec['set_id']
                                );
@@ -456,6 +521,7 @@ class tx_indexedsearch_crawler {
                $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
                $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
                $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
+               $indexerObj->hash['phash'] = -1;        // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
 
                $indexerObj->indexExternalUrl($url);
                $url_qParts = parse_url($url);
@@ -482,6 +548,58 @@ class tx_indexedsearch_crawler {
        }
 
        /**
+        * Indexing Single Record
+        *
+        * @param       array           Record to index
+        * @param       array           Configuration Record
+        * @param       array           Rootline array to relate indexing to
+        * @return      void
+        */
+       function indexSingleRecord($r,$cfgRec,$rl=NULL) {
+
+                       // Load indexer if not yet.
+               $this->loadIndexerClass();
+
+                       // Init (root line falls back to the one found from the configuration record's pid):
+               $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
+               $fieldList = t3lib_div::trimExplode(',',$cfgRec['fieldlist'],1);
+               $languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'];
+               $sys_language_uid = $languageField ? $r[$languageField] : 0;
+
+                       // (Re)-Indexing a row from a table:
+               $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
+               parse_str(str_replace('###UID###',$r['uid'],$cfgRec['get_params']),$GETparams);
+               $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
+               $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
+               $indexerObj->forceIndexing = TRUE;
+
+               $theTitle = '';         // Stays empty when the field list is empty (previously undefined in that case)
+               $theContent = '';
+               foreach($fieldList as $k => $v) {       // First listed field is the title, all following fields are concatenated as content
+                       if (!$k)        {
+                               $theTitle = $r[$v];
+                       } else {
+                               $theContent.= $r[$v].' ';
+                       }
+               }
+
+                       // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
+               $indexerObj->backend_indexAsTYPO3Page(
+                       strip_tags($theTitle),
+                       '',
+                       '',
+                       strip_tags($theContent),
+                       $GLOBALS['LANG']->charSet,      // Requires that $GLOBALS['LANG'] is available here; the indexer needs a valid charset, otherwise nothing is indexed
+                       $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
+                       $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
+                       $r['uid']
+               );
+
+               #echo print_r($indexerObj->internal_log);
+               #echo print_r($indexerObj->contentParts);
+       }
+
+       /**
         * Include indexer class.
         *
         * @return      void
@@ -524,6 +642,135 @@ class tx_indexedsearch_crawler {
 
                return $rootline_uids;
        }
+
+       /**
+        * Generate the unix time stamp for next visit.
+        *
+        * @param       array           Index configuration record
+        * @return      integer         The next time stamp
+        */
+       function generateNextIndexingTime($cfgRec)      {
+               $currentTime = time();
+
+                       // Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
+               if ($cfgRec['timer_frequency']<=24*3600)        {
+                       $aMidNight = mktime (0,0,0)-1*24*3600;
+               } else {
+                       $lastTime = $cfgRec['timer_next_indexing']?$cfgRec['timer_next_indexing']:time();
+                       $aMidNight = mktime (0,0,0, date('m',$lastTime), date('d',$lastTime), date('Y',$lastTime));     // Full four-digit year, so mktime() does not have to guess the century
+               }
+
+                       // Find last offset time plus frequency in seconds (frequency is forced to at least 1, which also rules out a division by zero below):
+               $lastSureOffset = $aMidNight+t3lib_div::intInRange($cfgRec['timer_offset'],0,86400);
+               $frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'],1);
+
+                       // Now, find out how many blocks of the length of frequency there is until the next time:
+               $frequencyBlocksUntilNextTime = ceil(($currentTime-$lastSureOffset)/$frequencySeconds);
+
+                       // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
+               $nextTime = $lastSureOffset + $frequencyBlocksUntilNextTime*$frequencySeconds;
+
+               return (int)$nextTime;  // ceil() yields a float; return the integer time stamp that the doc comment promises
+/*             echo print_r(array(
+                       $cfgRec['timer_offset'],
+                       $frequencySeconds,
+                       date('d-m-Y H:i:s',$currentTime),
+                       date('d-m-Y H:i:s',$aMidNight),
+                       date('d-m-Y H:i:s',$lastSureOffset),
+                       date('d-m-Y H:i:s',$nextTime)
+               ));
+*/     }
+
+       /**
+        * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns TRUE.
+        *
+        * @param       string          URL to test
+        * @param       string          String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
+        * @return      boolean         TRUE if there is a matching URL (hence, do not index!)
+        */
+       function checkDeniedSuburls($url, $url_deny)    {
+               if (!trim($url_deny))   {
+                       return FALSE;   // Empty deny list: nothing can be denied
+               }
+               foreach(t3lib_div::trimExplode(chr(10),$url_deny,1) as $deniedPrefix)   {
+                       if (t3lib_div::isFirstPartOfStr($url,$deniedPrefix))    {
+                               echo $url.' /// '.$url_deny.chr(10);    // NOTE(review): prints straight to the output - looks like debug output, confirm that it is wanted
+                               return TRUE;
+                       }
+               }
+               return FALSE;
+       }
+
+
+       function addQueueEntryForHook($cfgRec, $title)  {
+                       // Queues a single entry for the indexing configuration $cfgRec through the parent object in $this->pObj; $title is stored as the entry's "url".
+               $nparams = array();
+               $nparams['indexConfigUid'] = $cfgRec['uid'];            // This must ALWAYS be the cfgRec uid!
+               $nparams['url'] = $title;
+               $nparams['procInstructions'] = array('[Index Cfg UID#'.$cfgRec['uid'].']');    // Informational only: shows that an indexing configuration added the entry.
+               $setId = $cfgRec['set_id'];
+               $this->pObj->addQueueEntry_callBack($setId,$nparams,$this->callBack,$cfgRec['pid']);
+       }
+
+
+
+
+
+
+
+       /*************************
+        *
+        * Hook functions for TCEmain (indexing of records)
+        *
+        *************************/
+
+       /**
+        * TCEmain hook function for on-the-fly indexing of database records
+        *
+        * @param       string          Status "new" or "update"
+        * @param       string          Table name
+        * @param       string          Record ID. For a new record it is a string pointing to an index inside t3lib_tcemain::substNEWwithIDs
+        * @param       array           Field array of updated fields in the operation
+        * @param       object          Reference to tcemain calling object
+        * @return      void
+        */
+       function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj) {
+
+                       // Only act when the operation actually changed at least one field:
+               if (count($fieldArray)) {
+
+                               // New records arrive with a placeholder id; translate it to the real uid via the TCEmain substitution table:
+                       if ($status=='new')     {
+                               $id = $pObj->substNEWwithIDs[$id];
+                       }
+
+                               // Get full record and if exists, search for indexing configurations:
+                       $currentRecord = t3lib_BEfunc::getRecord($table,$id);
+                       if (is_array($currentRecord))   {
+
+                                       // Select all indexing configurations that are not running (set_id=0), are of type "record" (1), have "index on change" enabled, point to this table and are either located on the same page as the record (with no alternative source pid) or have the record's pid as alternative source pid
+                               $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
+                                       '*',
+                                       'index_config',
+                                       'hidden=0
+                                               AND (starttime=0 OR starttime<='.time().')
+                                               AND set_id=0
+                                               AND type=1
+                                               AND table2index='.$GLOBALS['TYPO3_DB']->fullQuoteStr($table,'index_config').'
+                                               AND (
+                                                               (alternative_source_pid='.$GLOBALS['TYPO3_DB']->fullQuoteStr('','index_config').' AND pid='.intval($currentRecord['pid']).')
+                                                               OR (alternative_source_pid='.$GLOBALS['TYPO3_DB']->fullQuoteStr($currentRecord['pid'],'index_config').')
+                                                       )
+                                               AND records_indexonchange=1
+                                               '.t3lib_BEfunc::deleteClause('index_config')
+                               );
+
+                               foreach($indexingConfigurations as $cfgRec)     {       // Index the record once for every matching configuration
+                                       $this->indexSingleRecord($currentRecord,$cfgRec);
+                               }
+                       }
+               }
+       }
 }
 
 
index ad98101..d279494 100755 (executable)
@@ -351,7 +351,7 @@ class tx_indexedsearch_indexer {
         * @param       string          Keywords equivalent
         * @param       string          Description equivalent
         * @param       string          The main content to index
-        * @param       string          The charset of the title, keyword, description and body-content
+        * @param       string          The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
         * @param       integer         Last modification time, in seconds
         * @param       integer         The creation date of the content, in seconds
         * @param       integer         The record UID that the content comes from (for registration with the indexed rows)
index 56440e8..fb46194 100755 (executable)
@@ -1,35 +1,25 @@
-***************
-TODO / projects:
-***************
-
 Version 3:
-- Refactor "class.crawler.php"
-- Add timing controls
-- Hook + example for external clients
-- IndexCfg option:
-       - Exclude URLs (regex?), possibly with graphical interface selecting it from indexed content.
-       - Record-indexing: support languageField in records
-       - Crawling in the night (enter time for indexing configurations), but possible to force.
-       - New type: Page Tree (traversing a part at night like crawler allows us...)
-- Indexing configuration overview (including status and manual clearing possibility)
-- Set up test environment with CLI running
 
-Frontend:
-       - Searching in certain external site
-       - Mac "spotlight" like searching? (Define which categories are bundled and which not)
-       - Alternative presentationer af når records er indexerede.
+BUGS to solve before 4.0:
+       - The "crawler" extension: what if crawler-script doesn't end and clear the process-script?
+       - Indexing of TYPO3.org resulted in: links to wiki pages were wrong!
 
+TODO:
+       Frontend:
+               - Searching in certain external site
+               - Mac "spotlight" like searching? (Define which categories are bundled and which not)
+               - Alternative presentationer af når records er indexerede.
 
+***************
+TODO / projects:
+***************
 
-Testing indexing crawler for:
-       - 3DS
-       - TYPO3.org copy
-       - Metropol
-       - FI
-       - Link Factory
-       - Brunata
 
 **************
+- IndexCfg option:
+       - New type: Page Tree (traversing a part at night like crawler allows us...) [Basing this on the features of "crawler", simply adding log-entries]
+
+- Indexing configuration overview (including status and manual clearing possibility)
 
 Bugs / Issues:
 - The checkbox "No Search" in the page header is only respected by indexed_search during indexing! (A page will not be indexed when "No Search" is set). However when searching results are not filtered based on this flag - so if a page is indexed before the no search flag is set it will be found in search results. To change this is hard because the getTreeList() function that fetches all page ids cannot take a where-clause to filter it out but must have hardcoded support. Alternatively the pages table must be joined into the search result so we can select on the field. A solution is still not agreed upon.
index 1ecb0c0..77de35b 100755 (executable)
@@ -15,6 +15,9 @@ $TYPO3_CONF_VARS['SC_OPTIONS']['tslib/class.tslib_fe.php']['headerNoCache']['tx_
 $TYPO3_CONF_VARS['EXTCONF']['crawler']['procInstructions']['tx_indexedsearch_reindex'] = 'Re-indexing';
 $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks']['tx_indexedsearch_crawl'] = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';
 
+       // Register with TCEmain:
+$TYPO3_CONF_VARS['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['processDatamapClass']['tx_indexedsearch'] = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';
+
        // Configure default document parsers:
 $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] = array(
        'pdf' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
@@ -50,4 +53,8 @@ $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['pi1_hooks'] = array (
        // EXAMPLE of adding fields to root line:
 #$TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields']['level3'] = 3;
 
+
+       // Example of crawlerhook (see also ext_tables.php!)
+$TYPO3_CONF_VARS['EXTCONF']['indexed_search']['crawler']['tx_myext_example1'] = 'EXT:indexed_search/example/class.crawlerhook.php:&tx_indexedsearch_crawlerhook';
+
 ?>
\ No newline at end of file
index 4696c27..fdaa28e 100755 (executable)
@@ -23,27 +23,34 @@ if (TYPO3_MODE=="BE")       {
 }
 
 t3lib_extMgm::allowTableOnStandardPages('index_config');
+t3lib_extMgm::addLLrefForTCAdescr('index_config','EXT:indexed_search/locallang_csh_indexcfg.xml');
 
+if (t3lib_extMgm::isLoaded('crawler')) {
+       $TCA['index_config'] = Array (
+               'ctrl' => Array (
+                       'title' => 'LLL:EXT:indexed_search/locallang_db.php:index_config',
+                       'label' => 'title',
+                       'tstamp' => 'tstamp',
+                       'crdate' => 'crdate',
+                       'cruser_id' => 'cruser_id',
+                       'type' => 'type',
+                       'default_sortby' => 'ORDER BY crdate',
+                       'enablecolumns' => Array (
+                               'disabled' => 'hidden',
+                               'starttime' => 'starttime',
+                       ),
+                       'dynamicConfigFile' => t3lib_extMgm::extPath($_EXTKEY).'tca.php',
+                       'iconfile' => 'default.gif',
+               ),
+               'feInterface' => Array (
+                       'fe_admin_fieldList' => 'hidden, starttime, title, description, type, depth, table2index, alternative_source_pid, get_params, chashcalc, filepath, extensions',
+               )
+       );
+}
 
-$TCA['index_config'] = Array (
-    'ctrl' => Array (
-        'title' => 'LLL:EXT:indexed_search/locallang_db.php:index_config',
-        'label' => 'title',
-        'tstamp' => 'tstamp',
-        'crdate' => 'crdate',
-        'cruser_id' => 'cruser_id',
-        'type' => 'type',
-        'default_sortby' => 'ORDER BY crdate',
-        'enablecolumns' => Array (
-            'disabled' => 'hidden',
-            'starttime' => 'starttime',
-        ),
-        'dynamicConfigFile' => t3lib_extMgm::extPath($_EXTKEY).'tca.php',
-        'iconfile' => 'default.gif',
-    ),
-    'feInterface' => Array (
-        'fe_admin_fieldList' => 'hidden, starttime, title, description, type, depth, table2index, alternative_source_pid, get_params, chashcalc, filepath, extensions',
-    )
-);
 
+       // Example of crawlerhook (see also ext_localconf.php!). NOTE(review): these lines run unconditionally, while $TCA['index_config'] above is only defined when the "crawler" extension is loaded - confirm that this is intended.
+t3lib_div::loadTCA('index_config');
+$TCA['index_config']['columns']['type']['config']['items'][] =  Array('My Crawler hook!', 'tx_myext_example1');
+$TCA['index_config']['types']['tx_myext_example1'] = $TCA['index_config']['types']['0'];
 ?>
\ No newline at end of file
index 169dd42..fb33a60 100755 (executable)
@@ -150,21 +150,18 @@ CREATE TABLE index_debug (
 CREATE TABLE index_config (
     uid int(11) DEFAULT '0' NOT NULL auto_increment,
     pid int(11) DEFAULT '0' NOT NULL,
-    tstamp int(11) unsigned DEFAULT '0' NOT NULL,
-    crdate int(11) unsigned DEFAULT '0' NOT NULL,
-    cruser_id int(11) unsigned DEFAULT '0' NOT NULL,
-    hidden tinyint(4) unsigned DEFAULT '0' NOT NULL,
-    starttime int(11) unsigned DEFAULT '0' NOT NULL,
+    tstamp int(11) DEFAULT '0' NOT NULL,
+    crdate int(11) DEFAULT '0' NOT NULL,
+    cruser_id int(11) DEFAULT '0' NOT NULL,
+    hidden tinyint(4) DEFAULT '0' NOT NULL,
+    starttime int(11) DEFAULT '0' NOT NULL,
 
     set_id int(11) DEFAULT '0' NOT NULL,
     session_data mediumtext NOT NULL,
-    first_run_time int(11) unsigned DEFAULT '0' NOT NULL,
-    frequency int(11) unsigned DEFAULT '0' NOT NULL,
-    last_run int(11) unsigned DEFAULT '0' NOT NULL,
 
     title tinytext NOT NULL,
     description text NOT NULL,
-    type int(11) unsigned DEFAULT '0' NOT NULL,
+    type varchar(30) DEFAULT '' NOT NULL,
     depth int(11) unsigned DEFAULT '0' NOT NULL,
     table2index tinytext NOT NULL,
     alternative_source_pid blob NOT NULL,
@@ -175,6 +172,13 @@ CREATE TABLE index_config (
     filepath tinytext NOT NULL,
     extensions tinytext NOT NULL,
 
+       timer_next_indexing int(11) DEFAULT '0' NOT NULL,
+       timer_frequency int(11) DEFAULT '0' NOT NULL,
+       timer_offset int(11) DEFAULT '0' NOT NULL,
+       url_deny text NOT NULL,
+       recordsbatch int(11) DEFAULT '0' NOT NULL,
+       records_indexonchange tinyint(4) DEFAULT '0' NOT NULL,
+
     PRIMARY KEY (uid),
     KEY parent (pid)
 );
diff --git a/typo3/sysext/indexed_search/locallang_csh_indexcfg.xml b/typo3/sysext/indexed_search/locallang_csh_indexcfg.xml
new file mode 100755 (executable)
index 0000000..07a20dd
--- /dev/null
@@ -0,0 +1,115 @@
+<?xml version="1.0" encoding="utf-8" standalone="yes" ?>
+<T3locallang>
+       <meta type="array">
+               <description>CSH for Indexing Configurations</description>
+               <type>CSH</type>
+               <fileId>EXT:indexed_search/locallang_csh_indexcfg.xml</fileId>
+               <csh_table>index_config</csh_table>
+               <keep_original_text>1</keep_original_text>
+               <ext_filename_template>EXT:csh_###LANGKEY###/indexed_search/###LANGKEY###.locallang_csh_indexcfg.xml</ext_filename_template>
+               <labelContext type="array">
+               </labelContext>
+       </meta>
+       <data type="array">
+               <languageKey index="default" type="array">
+                       <label index=".description">The Indexed Search configuration allows you to set up indexing of resources which are external to the TYPO3 page tree. This could be a library of records (like news), a directory structure of files (like PDFs) or an external URL to be crawled to some depth - or third-party content provided from other extensions via hooks.</label>
+                       <label index=".details">The Indexing Configuration records are used to set up how the indexing occurs, at which time and related to which part of the page tree. An important requirement to keep in mind is that the &quot;crawler&quot; extension must be installed since the indexing engine uses the &quot;crawler&quot; extension's queue as a timing device - and using the &quot;crawler&quot; extension means using a cron-job to activate it from the shell.</label>
+                       <label index="title.description">Provide a title for the configuration.</label>
+                       <label index="description.description">If needed, give it a description or put other important notes in this field.</label>
+                       <label index="hidden.description">Disable the configuration by this checkbox. If disabled, the configuration will be bypassed by the crawler's cron job.</label>
+                       <label index="hidden.details">Notice: Disabling the indexing process will not hide already indexed content in search results!</label>
+                       <label index="_hidden.seeAlso">index_config:starttime</label>
+                       <label index="starttime.description">Set a start time for the configuration. If set, the configuration will be bypassed by the crawler's cron job until this time is reached.</label>
+                       <label index="_starttime.seeAlso">index_config:hidden</label>
+                       <label index="timer_frequency.description">Use this setting to adjust how often you would like a re-indexing process to run for the configuration.</label>
+                       <label index="_timer_frequency.seeAlso">index_config:timer_next_indexing, 
+index_config:timer_offset</label>
+                       <label index="timer_offset.description">The offset adjusts at what time of day you want the indexing process to run.</label>
+                       <label index="_timer_offset.seeAlso">index_config:timer_frequency</label>
+                       <label index="timer_next_indexing.description">This field contains the date and time of the next occurring indexing session. If left blank the next indexing will occur as soon as the cron-script is run again (typically within the next minute) and afterwards set to the next time according to frequency and offset.</label>
+                       <label index="_timer_next_indexing.seeAlso">index_config:timer_offset, 
+index_config:timer_frequency</label>
+                       <label index="type.description">Select the type of Indexing Configuration here.</label>
+                       <label index="type.details">&lt;b&gt;Records&lt;/b&gt; - You will be able to index records on a specified page from a specified database table. You can determine which fields from that record you want to index and what additional GET parameters should look like when linking to the search result.
+&lt;b&gt;Files&lt;/b&gt; - Allows you to index a whole directory of files from the fileadmin/ folder. You can optionally specify how many levels of recursion you want.
+&lt;b&gt;External URL&lt;/b&gt; - Allows you to index an external site and search it from your TYPO3 installation! You can specify how deep the crawler should go on that external URL.
+&lt;b&gt;Custom&lt;/b&gt; - Other extensions might register custom types of configurations.</label>
+                       <label index="table2index.description">Specify the database table to index records from.</label>
+                       <label index="alternative_source_pid.description">By default the indexer will select records from the page id where the Indexing Configuration is stored. If you want an alternative page as the source of your records you can enter it here.</label>
+                       <label index="_alternative_source_pid.seeAlso">index_config:table2index</label>
+                       <label index="get_params.description">Enter the GET parameters necessary to display the search results. You can use ###UID### as a marker in the parameter template.</label>
+                       <label index="get_params.details">Notice: The GET parameters are appended to the URL of the page where the Indexing Configuration is stored (which must be the page where a plugin exists that can display the records with the given parameters!).
+Example value: &quot;&amp;showUid=###UID###&quot;</label>
+                       <label index="_get_params.seeAlso">index_config:table2index</label>
+                       <label index="fieldlist.description">Enter a comma-list of fields to be indexed. The first field name will be used for the search result title.</label>
+                       <label index="fieldlist.details">&lt;b&gt;Example:&lt;/b&gt; &quot;header,bodytext,image_caption&quot; would index the Header, Bodytext and Image Caption fields from &quot;tt_content&quot;. Notice that you must use the real database field names, don't spell them wrong!</label>
+                       <label index="_fieldlist.seeAlso">index_config:table2index</label>
+                       <label index="chashcalc.description">If checked, the URL in the search result for records will be encoded with a &quot;&amp;cHash&quot; parameter to make it cacheable. Use only if supported by the plugin!</label>
+                       <label index="_chashcalc.seeAlso">index_config:table2index</label>
+                       <label index="recordsbatch.description">Enter how many records to index for each instance of the indexing process (how many per minute). Default is 100 records.</label>
+                       <label index="_recordsbatch.seeAlso">index_config:table2index</label>
+                       <label index="records_indexonchange.description">If set, new and changed records from this table will be indexed through a hook in the core (TCEmain) meaning that they will be searchable immediately after addition to the system.</label>
+                       <label index="_records_indexonchange.seeAlso">index_config:table2index</label>
+                       <label index="externalUrl.description">Enter the URL of the external website you want to index.</label>
+                       <label index="url_deny.description">Enter a URL on each line inside of which the crawler should not descend.</label>
+                       <label index="url_deny.details">Example: 
+If you wish to index &quot;http://typo3.org/&quot; but not &quot;http://typo3.org/extensions/&quot; and &quot;http://typo3.org/downloads/&quot; then you simply enter those two URLs into the box like this:
+
+http://typo3.org/downloads/
+http://typo3.org/extensions/</label>
+                       <label index="_url_deny.seeAlso">index_config:externalUrl</label>
+                       <label index="filepath.description">Enter a filepath inside fileadmin/ where you want the files to be indexed.</label>
+                       <label index="filepath.details">Example: &quot;fileadmin/newsletters/&quot;</label>
+                       <label index="extensions.description">If you want to index only specific files in the directory, enter a list of file extensions here. For example: &quot;html,pdf,doc&quot;</label>
+                       <label index="_extensions.seeAlso">index_config:filepath</label>
+                       <label index="depth.description">Levels of recursion.</label>
+                       <label index="depth.details">For file indexing it is the level of directories to descend. For External URLs it's the depth to which the crawler will go with links.</label>
+                       <label index="_depth.seeAlso">index_config:externalUrl, 
+index_config:filepath</label>
+               </languageKey>
+               <languageKey index="dk">EXT:csh_dk/indexed_search/dk.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="de">EXT:csh_de/indexed_search/de.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="no">EXT:csh_no/indexed_search/no.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="it">EXT:csh_it/indexed_search/it.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="fr">EXT:csh_fr/indexed_search/fr.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="es">EXT:csh_es/indexed_search/es.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="nl">EXT:csh_nl/indexed_search/nl.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="cz">EXT:csh_cz/indexed_search/cz.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="pl">EXT:csh_pl/indexed_search/pl.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="si">EXT:csh_si/indexed_search/si.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="fi">EXT:csh_fi/indexed_search/fi.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="tr">EXT:csh_tr/indexed_search/tr.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="se">EXT:csh_se/indexed_search/se.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="pt">EXT:csh_pt/indexed_search/pt.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="ru">EXT:csh_ru/indexed_search/ru.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="ro">EXT:csh_ro/indexed_search/ro.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="ch">EXT:csh_ch/indexed_search/ch.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="sk">EXT:csh_sk/indexed_search/sk.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="lt">EXT:csh_lt/indexed_search/lt.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="is">EXT:csh_is/indexed_search/is.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="hr">EXT:csh_hr/indexed_search/hr.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="hu">EXT:csh_hu/indexed_search/hu.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="gl">EXT:csh_gl/indexed_search/gl.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="th">EXT:csh_th/indexed_search/th.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="gr">EXT:csh_gr/indexed_search/gr.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="hk">EXT:csh_hk/indexed_search/hk.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="eu">EXT:csh_eu/indexed_search/eu.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="bg">EXT:csh_bg/indexed_search/bg.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="br">EXT:csh_br/indexed_search/br.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="et">EXT:csh_et/indexed_search/et.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="ar">EXT:csh_ar/indexed_search/ar.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="he">EXT:csh_he/indexed_search/he.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="ua">EXT:csh_ua/indexed_search/ua.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="lv">EXT:csh_lv/indexed_search/lv.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="jp">EXT:csh_jp/indexed_search/jp.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="vn">EXT:csh_vn/indexed_search/vn.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="ca">EXT:csh_ca/indexed_search/ca.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="ba">EXT:csh_ba/indexed_search/ba.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="kr">EXT:csh_kr/indexed_search/kr.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="eo">EXT:csh_eo/indexed_search/eo.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="my">EXT:csh_my/indexed_search/my.locallang_csh_indexcfg.xml</languageKey>
+               <languageKey index="hi">EXT:csh_hi/indexed_search/hi.locallang_csh_indexcfg.xml</languageKey>
+       </data>
+       <orig_text type="array">
+       </orig_text>
+</T3locallang>
\ No newline at end of file
index 7dc339d..40b4411 100755 (executable)
                        <label index="index_config.alternative_source_pid">Alternative Source Page:</label>
                        <label index="index_config.get_params">GET parameter string (with ###UID### substitution):</label>
                        <label index="index_config.fields">Fields (first is title):</label>
-                       <label index="index_config.externalUrl">External URL:</label>
+                       <label index="index_config.externalUrl">External URL (eg. "http://www.domain.org/"):</label>
                        <label index="index_config.chashcalc">Calculate cHash (force caching)?</label>
                        <label index="index_config.filepath">Filepath:</label>
                        <label index="index_config.extensions">Limit to extensions (commalist):</label>
+
+                       <label index="index_config.url_deny">Enter sub-URLs in which not to descend:</label>
+                       <label index="index_config.records_indexonchange">Index Records immediately when saved?:</label>
+                       <label index="index_config.timer_next_indexing">Next indexing is scheduled (if empty, then immediately):</label>
+                       <label index="index_config.timer_offset">Timing offset from midnight:</label>
+                       <label index="index_config.timer_frequency">How often would you like a re-index?:</label>
+                       <label index="index_config.timer_frequency.I.0">Every hour</label>
+                       <label index="index_config.timer_frequency.I.1">Every day (24 hours)</label>
+                       <label index="index_config.timer_frequency.I.2">Every week</label>
+                       <label index="index_config.recordsbatch">How many records to index a minute (default is 100):</label>
                </languageKey>
                <languageKey index="dk">EXT:csh_dk/indexed_search/dk.locallang_db.xml</languageKey>
                <languageKey index="de">EXT:csh_de/indexed_search/de.locallang_db.xml</languageKey>
index 3172230..d77ebb4 100644 (file)
@@ -134,12 +134,79 @@ $TCA['index_config'] = Array (
                 'size' => '30',
             )
         ),
+        'url_deny' => Array (
+            'label' => 'LLL:EXT:indexed_search/locallang_db.php:index_config.url_deny',
+            'config' => Array (
+                'type' => 'text',
+                'cols' => '30',
+                'rows' => '2',
+            )
+        ),
+        'records_indexonchange' => Array (
+            'label' => 'LLL:EXT:indexed_search/locallang_db.php:index_config.records_indexonchange',
+            'config' => Array (
+                'type' => 'check',
+                'default' => '0',
+            )
+        ),
+        'timer_next_indexing' => Array (
+            'label' => 'LLL:EXT:indexed_search/locallang_db.php:index_config.timer_next_indexing',
+            'config' => Array (
+                'type' => 'input',
+                'size' => '12',
+                'max' => '20',
+                'eval' => 'datetime',
+                'default' => '0',
+                'checkbox' => '0'
+            )
+        ),
+        'timer_offset' => Array (
+            'label' => 'LLL:EXT:indexed_search/locallang_db.php:index_config.timer_offset',
+            'config' => Array (
+                'type' => 'input',
+                'size' => '8',
+                'max' => '20',
+                'eval' => 'time',
+                'default' => 3600,
+            )
+        ),
+        'timer_frequency' => Array (
+            'label' => 'LLL:EXT:indexed_search/locallang_db.php:index_config.timer_frequency',
+            'config' => Array (
+                'type' => 'select',
+                'items' => Array (
+                    Array('LLL:EXT:indexed_search/locallang_db.php:index_config.timer_frequency.I.0', '3600'),
+                    Array('LLL:EXT:indexed_search/locallang_db.php:index_config.timer_frequency.I.1', '86400'),
+                    Array('LLL:EXT:indexed_search/locallang_db.php:index_config.timer_frequency.I.2', '604800'),
+                ),
+                'size' => 1,
+                'maxitems' => 1,
+                'default' => 86400,
+            )
+        ),
+        'recordsbatch' => Array (
+            'label' => 'LLL:EXT:indexed_search/locallang_db.php:index_config.recordsbatch',
+            'config' => Array (
+                'type' => 'input',
+                'size' => '8',
+                'max' => '20',
+                'eval' => 'int',
+                'default' => '0',
+                'checkbox' => '0'
+            )
+        ),
+        'set_id' => Array (
+            'label' => 'Session ID (if > zero, then indexing job is running):',
+            'config' => Array (
+                'type' => 'none',
+            )
+        ),
     ),
     'types' => Array (
-        '0' => Array('showitem' => 'title;;1;;2-2-2, description, type'),
-        '1' => Array('showitem' => 'title;;1;;2-2-2, description, type;;;;3-3-3, table2index;;;;3-3-3, alternative_source_pid, fieldlist, get_params, chashcalc'),
-        '2' => Array('showitem' => 'title;;1;;2-2-2, description, type;;;;3-3-3, filepath;;;;3-3-3, extensions, depth'),
-        '3' => Array('showitem' => 'title;;1;;2-2-2, description, type;;;;3-3-3, externalUrl;;;;3-3-3, depth'),
+        '0' => Array('showitem' => 'title;;1;;2-2-2, description, timer_next_indexing, timer_offset, timer_frequency, set_id, type;;;;3-3-3'),
+        '1' => Array('showitem' => 'title;;1;;2-2-2, description, timer_next_indexing, timer_offset, timer_frequency, set_id, type;;;;3-3-3, table2index;;;;3-3-3, alternative_source_pid, fieldlist, get_params, chashcalc,recordsbatch,records_indexonchange'),
+        '2' => Array('showitem' => 'title;;1;;2-2-2, description, timer_next_indexing, timer_offset, timer_frequency, set_id, type;;;;3-3-3, filepath;;;;3-3-3, extensions, depth'),
+        '3' => Array('showitem' => 'title;;1;;2-2-2, description, timer_next_indexing, timer_offset, timer_frequency, set_id, type;;;;3-3-3, externalUrl;;;;3-3-3, depth, url_deny'),
     ),
     'palettes' => Array (
         '1' => Array('showitem' => 'starttime,hidden')