See Changelog: Updates to Indexed Search (mainly), t3lib_cs (bug), t3lib_tcemain...
author Kasper Skårhøj <kasper@typo3.org>
Sun, 28 Nov 2004 20:09:29 +0000 (20:09 +0000)
committer Kasper Skårhøj <kasper@typo3.org>
Sun, 28 Nov 2004 20:09:29 +0000 (20:09 +0000)
git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@515 709f56b5-9817-0410-a4d7-c38de5d9e867

43 files changed:
ChangeLog
TODO.txt
t3lib/class.t3lib_befunc.php
t3lib/class.t3lib_cs.php
t3lib/class.t3lib_foldertree.php
t3lib/class.t3lib_tcemain.php
t3lib/class.t3lib_tsparser_ext.php
t3lib/config_default.php
typo3/alt_db_navframe.php
typo3/mod/tools/em/index.php
typo3/sysext/cms/tslib/class.tslib_content.php
typo3/sysext/cms/tslib/class.tslib_fe.php
typo3/sysext/indexed_search/class.doublemetaphone.php [new file with mode: 0755]
typo3/sysext/indexed_search/class.external_parser.php [new file with mode: 0755]
typo3/sysext/indexed_search/class.indexer.php
typo3/sysext/indexed_search/class.lexer.php [new file with mode: 0755]
typo3/sysext/indexed_search/cli/conf.php [new file with mode: 0644]
typo3/sysext/indexed_search/cli/indexer_cli.phpsh [new file with mode: 0755]
typo3/sysext/indexed_search/doc/TODO.txt
typo3/sysext/indexed_search/doc/manual.sxw
typo3/sysext/indexed_search/ext_conf_template.txt
typo3/sysext/indexed_search/ext_emconf.php
typo3/sysext/indexed_search/ext_localconf.php
typo3/sysext/indexed_search/ext_tables.php
typo3/sysext/indexed_search/ext_tables.sql
typo3/sysext/indexed_search/locallang_db.xml [new file with mode: 0755]
typo3/sysext/indexed_search/mod/index.php
typo3/sysext/indexed_search/modfunc1/class.tx_indexedsearch_modfunc1.php
typo3/sysext/indexed_search/pi/class.tx_indexedsearch.php
typo3/sysext/indexed_search/pi/res/csv.gif [new file with mode: 0755]
typo3/sysext/indexed_search/pi/res/jpg.gif [new file with mode: 0755]
typo3/sysext/indexed_search/pi/res/pps.gif [new file with mode: 0644]
typo3/sysext/indexed_search/pi/res/ppt.gif [new file with mode: 0644]
typo3/sysext/indexed_search/pi/res/rtf.gif [new file with mode: 0755]
typo3/sysext/indexed_search/pi/res/sxc.gif [new file with mode: 0644]
typo3/sysext/indexed_search/pi/res/sxi.gif [new file with mode: 0644]
typo3/sysext/indexed_search/pi/res/sxw.gif [new file with mode: 0644]
typo3/sysext/indexed_search/pi/res/tif.gif [new file with mode: 0755]
typo3/sysext/indexed_search/pi/res/xls.gif [new file with mode: 0644]
typo3/sysext/indexed_search/pi/res/xml.gif [new file with mode: 0755]
typo3/sysext/indexed_search/tca.php [new file with mode: 0644]
typo3/sysext/setup/mod/index.php
typo3/template.php

index cdb7335..fccaf94 100755 (executable)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2004-11-28  Kasper Skårhøj,,,  <kasper@typo3.com>
+
+       * Main feature: Lots of updates on Indexed Search extension. The changes are mainly in the indexer, not the search plugin. The work is NOT FINISHED yet, so do not update a production site with it! One main thing to be aware of is that all indexing is done internally as utf-8. You should flush your old index tables before running the new indexer.
+       * Fixed bug in t3lib_cs::utf8_strtrunc() (or so): the final return used substr($str,$len) instead of substr($str,0,$len)... d'oh Martin!
+       * Fixed bug (spelling mistake) in the hook "processDatamap_preProcessFieldArray"
+       * Fixed order of configuration forms in Extension Manager
+       * Added timezone option in TYPO3_CONF_VARS array
+       * Added right-click feature on context menus. Can be disabled with TYPO3_CONF_VARS if you don't like it. And a rightclick on the page/folder _title_ will also activate the menu! Theoretically it is not valid XHTML. Works in Mozilla and MSIE. Thanks Wolfgang!
+       * Added TS option "USERUID_substToken" 
+
 2004-11-26  Michael Stucki  <michael@typo3.org>
 
        * Fixed bug #0000527: Title tag is added even if the page title was empty. Thanks to Hannes Schmid.
index 4829ba7..05c8ff9 100755 (executable)
--- a/TODO.txt
+++ b/TODO.txt
@@ -135,6 +135,7 @@ TCEforms:
                - "readonly" flag, or user group dependant. See "Message-Id: <200210241441.50295.r.fritz@colorcube.de>"
        - ? type:
                - "Inverse relations"/"Foreign relations": A "Pseudo field", which lists records REFERING TO this record (foreign relations, eg. many small price-records belonging to ONE shop-article). Possibly this could also EDIT those references (attaching/adding new, removing old, no manual ordering though! - This is what RENE is doing (Message-Id: <E17LO4D-0002hj-00@cube.colorcube>)
+       - Concealed password fields, support for two fields with the same password being submitted... (JavaScript evaluated?)
 - BUGS:
        - Ask to save record when you want to add a category with the "Plus" icon.
        - CHECK: ###STORAGE_PID### incorrectly calculated?
@@ -747,6 +748,17 @@ General Crawler ("crawler"):
        - From session log we can read out the status-arrays of the threads and display in backend (handler shows as it likes.)
        - Start / stop crawler session
 
+For indexing (and caching and publishing!) we basically need to configure traversal through configurable parameters like:
+       - id
+       - type
+       - L (sys_language)
+       - MP (?)
+       - Simulate user logins (eg. sending "no-login", "user:kasper, password=blabla", "user:homer, password=blabla2")
+       - Additional parameter ranges per page (eg. "tx_myext[var1] : 1-7   AND tx_myext[var1] : 0,1 ")
+       - Command parameter like "&DO_INDEX=1" or "&RECACHE=1" (or both!)
+
+       - Use CLI script for crawler?
+
 __________________________________________-
 CACHE MANAGEMENT EXTENSION
 
index c805dc8..5906d3e 100755 (executable)
@@ -519,7 +519,7 @@ class t3lib_BEfunc  {
        function BEgetRootLine($uid,$clause='') {
                $loopCheck = 100;
                $theRowArray = Array();
-               $output=Array();
+               $output = Array();
                while ($uid!=0 && $loopCheck>0) {
                        $loopCheck--;
                        $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery(
@@ -540,7 +540,7 @@ class t3lib_BEfunc  {
                                break;
                        }
                }
-               if ($uid==0) {$theRowArray[]=Array('uid'=>0,'title'=>'');}
+               if ($uid==0) {$theRowArray[] = Array('uid'=>0,'title'=>'');}
                if (is_array($theRowArray))     {
                        reset($theRowArray);
                        $c=count($theRowArray);
index 7d95fef..bb2e86b 100755 (executable)
@@ -1680,7 +1680,7 @@ class t3lib_cs {
                        if ($bc+$i > $len)      return substr($str,0,$i);
                         // fallthru: multibyte char fits into length
                }
-               return substr($str,$len);
+               return substr($str,0,$len);
        }
 
        /**
index 9c57ddb..045a39e 100755 (executable)
@@ -118,6 +118,24 @@ class t3lib_folderTree extends t3lib_treeView  {
        }
 
        /**
+        * Wrapping $title in a-tags.
+        *
+        * @param       string          Title string
+        * @param       array           Item record
+        * @param       integer         Bank pointer (which mount point number)
+        * @return      string
+        * @access private
+        */
+       function wrapTitle($title,$row,$bank=0) {
+               $aOnClick = 'return jumpTo(\''.$this->getJumpToParam($row).'\',this,\''.$this->domIdPrefix.$this->getId($row).'_'.$bank.'\');';
+               $CSM = '';
+               if ($GLOBALS['TYPO3_CONF_VARS']['BE']['useOnContextMenuHandler'])       {
+                       $CSM = ' oncontextmenu="'.htmlspecialchars($GLOBALS['TBE_TEMPLATE']->wrapClickMenuOnIcon('',$row['path'],'',0,'','',TRUE)).'"';
+               }
+               return '<a href="#" onclick="'.htmlspecialchars($aOnClick).'"'.$CSM.'>'.$title.'</a>';
+       }
+
+       /**
         * Returns the id from the record - for folders, this is an md5 hash.
         *
         * @param       array           Record array
index f94ab01..546ccf2 100755 (executable)
@@ -477,7 +477,7 @@ class t3lib_TCEmain {
 
                                                        // Hook: processDatamap_preProcessIncomingFieldArray
                                                foreach($hookObjectsArr as $hookObj)    {
-                                                       if (method_exists($hookObj, 'processDatamap_preProcessIncomingFieldArray')) {
+                                                       if (method_exists($hookObj, 'processDatamap_preProcessFieldArray')) {
                                                                $hookObj->processDatamap_preProcessFieldArray($incomingFieldArray, $table, $id, $this);
                                                        }
                                                }
index 980e307..f404f22 100755 (executable)
@@ -175,6 +175,7 @@ class t3lib_tsparser_ext extends t3lib_TStemplate   {
        var $ext_printAll=0;
        var $ext_CEformName="forms[0]";
        var $ext_defaultOnlineResourceFlag=0;
+       var $doNotSortCategoriesBeforeMakingForm = FALSE;
 
                // ts analyzer
        var $templateTitles=array();
@@ -1086,7 +1087,7 @@ class t3lib_tsparser_ext extends t3lib_TStemplate {
                        $help=$this->helpConfig;
                        $this->rArr=explode(",",$this->setup["resources"].",".implode($this->dirResources,","));
 
-                       asort($this->categories[$category]);
+                       if (!$this->doNotSortCategoriesBeforeMakingForm) asort($this->categories[$category]);
                        while(list($name,$type)=each($this->categories[$category]))     {
                                $params = $theConstants[$name];
                                if (is_array($params))  {
index fc3f3ee..8b1b4ba 100755 (executable)
@@ -75,6 +75,7 @@ $TYPO3_CONF_VARS = Array(
                'multiplyDBfieldSize' => 1,                             // Double: 1-5: Amount used to multiply the DB field size when the install tool is evaluating the database size (eg. "2.5"). This is useful if you want to expand the size of fields for utf-8 etc. For western european sites using utf-8 the need should not be for more than twice the normal single-byte size (2) and for chinese / asian languages 3 should suffice.
                'setMemoryLimit' => 0,                                  // Integer, memory_limit in MB: If more than 16, TYPO3 will try to use ini_set() to set the memory limit of PHP to the value. This works only if the function ini_set() is not disabled by your sysadmin.
                'displayErrors' => 0,                                   // Integer, -1,0,1. 0=Do not display any PHP error messages. 1=Display error messages. -1=Default setting. With this option, you can override the PHP setting "display_errors". It is suggested that you leave this unchanged but enable the "error_log" option in php.ini instead.
+               'serverTimeZone' => 1                                   // Integer, GMT offset of servers time (from time()). Default is "1" which is "GMT+1" (central european time). This value can be used in extensions that are GMT aware and wants to convert times to/from other timezones.
        ),
        'EXT' => Array (        // Options related to the Extension Management
                'noEdit' => 1,                                                  // Boolean: If set, the Extension Manager does NOT allow extension files to be edited! (Otherwise both local and global extensions can be edited.)
@@ -144,6 +145,7 @@ $TYPO3_CONF_VARS = Array(
                'customPermOptions' => array(),                 // Array with sets of custom permission options. Syntax is; 'key' => array('header' => 'header string, language splitted', 'items' => array('key' => array('label, language splitted', 'icon reference', 'Description text, language splitted'))). Keys cannot contain ":|," characters.
                'fileDenyPattern' => '\.php$|\.php.$',  // A regular expression that - if it matches a filename - will deny the file upload/rename or whatever in the webspace. Matching with eregi() (case-insensitive).
                'interfaces' => 'backend',                                      // This determines which interface options is available in the login prompt and in which order (All options: ",backend,frontend")
+               'useOnContextMenuHandler' => 1,                 // Boolean. If set, the context menus (clickmenus) in the backend are activated on right-click - although this is not a XHTML attribute!
                'loginLabels' => 'Username|Password|Interface|Log In|Log Out|Backend,Front End|Administration Login on ###SITENAME###|(Note: Cookies and JavaScript must be enabled!)|Important Messages:|Your login attempt did not succeed. Make sure to spell your username and password correctly, including upper/lowercase characters.',          // Language labels of the login prompt.
                'loginNews' => array(),                                         // In this array you can define news-items for the login screen. To this array, add arrays with assoc keys 'date', 'header', 'content' (HTML content) and for those appropriate value pairs
                'XCLASS' => Array(),                                    // See 'Inside TYPO3' document for more information.
@@ -360,4 +362,4 @@ unset($LOCAL_LANG);
        // Setting some global vars:
 $EXEC_TIME = time();                                   // $EXEC_TIME is set so that the rest of the script has a common value for the script execution time
 $SIM_EXEC_TIME = $EXEC_TIME;                   // $SIM_EXEC_TIME is set to $EXEC_TIME but can be altered later in the script if we want to simulate another execution-time when selecting from eg. a database
-?>
\ No newline at end of file
+?>
index fd65d40..bf34272 100755 (executable)
@@ -139,6 +139,24 @@ class localPageTree extends t3lib_browseTree {
                }
                return $str;
        }
+
+       /**
+        * Wrapping $title in a-tags.
+        *
+        * @param       string          Title string
+        * @param       array           Item record
+        * @param       integer         Bank pointer (which mount point number)
+        * @return      string
+        * @access private
+        */
+       function wrapTitle($title,$row,$bank=0) {
+               $aOnClick = 'return jumpTo(\''.$this->getJumpToParam($row).'\',this,\''.$this->domIdPrefix.$this->getId($row).'_'.$bank.'\');';
+               $CSM = '';
+               if ($GLOBALS['TYPO3_CONF_VARS']['BE']['useOnContextMenuHandler'])       {
+                       $CSM = ' oncontextmenu="'.htmlspecialchars($GLOBALS['TBE_TEMPLATE']->wrapClickMenuOnIcon('','pages',$row['uid'],0,'&bank='.$this->bank,'',TRUE)).'"';
+               }
+               return '<a href="#" onclick="'.htmlspecialchars($aOnClick).'"'.$CSM.'>'.$title.'</a>';
+       }
 }
 
 
index af13faa..ec47dd5 100755 (executable)
@@ -3670,6 +3670,7 @@ EXTENSION KEYS:
 
                                // Load tsStyleConfig class and parse configuration template:
                        $tsStyleConfig = t3lib_div::makeInstance('t3lib_tsStyleConfig');
+                       $tsStyleConfig->doNotSortCategoriesBeforeMakingForm = TRUE;
                        $theConstants = $tsStyleConfig->ext_initTSstyleConfig(
                                t3lib_div::getUrl($absPath.'ext_conf_template.txt'),
                                $relPath,
index f13821e..f248dd2 100755 (executable)
@@ -249,11 +249,11 @@ if(t3lib_extMgm::isLoaded('obts')) {
  */
 class tslib_cObj {
        var $align = Array ('center', 'right', 'left');
-       var $caseConvStrings = array(
-                       'áéúíâêûôîæøåäöü',
-                       'ÁÉÚÍÄËÜÖÏÆØÅÄÖÜ'
+/*     var $caseConvStrings = array(
+                       'áéúíâêûôîæøåäöü',
+                       'ÁÉÚÍÄËÜÖÏÆØÅÄÖÜ'
                );
-
+*/
        /**
         * Holds ImageMagick parameters and extensions used for compression
         *
@@ -3471,7 +3471,7 @@ class tslib_cObj {
         * @param       array           TypoScript configuration.
         * @return      string          Return string
         * @author      Thomas Bley (all from moregroupware cvs code / readmessage.inc.php, published under gpl by Thomas)
-        * @author      Kasper Skårhøj
+        * @author      Kasper Skårhøj
         */
        function removeBadHTML($text, $conf)    {
 
@@ -3749,7 +3749,7 @@ class tslib_cObj {
                        }
                }
                $locationData = $GLOBALS['TSFE']->id.':'.$this->currentRecord;
-               $rec='&locationData='.$locationData;
+               $rec='&locationData='.rawurlencode($locationData);
                $hArr = array(
                        $jumpUrl,
                        $locationData,
@@ -6999,7 +6999,7 @@ class tslib_controlTable  {
        var $cMt = 0;           // content margin, top
        var $cMb = 1;           // content margin, bottom
 
-       var $contentW = 0;      // sætter en lille gif-spacer nedest i content-framen
+       var $contentW = 0;      // sætter en lille gif-spacer nedest i content-framen
 
        var $tableParams = 'border="0" cellspacing="0" cellpadding="0"';
 
@@ -7032,7 +7032,7 @@ class tslib_controlTable  {
                if ($this->bm) $rows++;
                if ($this->content) $rows++;
                if ($this->contentW) $rows++;
-               if (!$rows && $cols) $rows=1;           // hvis der slet ingen rækker er sat i midten men der trods alt er nogle kolonner
+               if (!$rows && $cols) $rows=1;           // hvis der slet ingen rækker er sat i midten men der trods alt er nogle kolonner
 
                if ($rows&&$cols)       {
                        $res = chr(10).'<table '.$this->tableParams.'>';
@@ -7064,7 +7064,7 @@ class tslib_controlTable  {
                        if ($this->rm)  {       $res.='<td'.$rowspan.' '.$this->rmTDparams.'>'.$this->rm.'</td>';               }
                        $res.= '</tr>';
 
-                               // flere end de 2 rækker
+                               // flere end de 2 rækker
                        $mCount = count($middle);
                        for($a=1;$a<$mCount;$a++)       {
                                $res.='<tr>'.$middle[$a].'</tr>';
index 3446bd6..1b43f45 100755 (executable)
                $this->no_cache = $no_cache ? 1 : 0;
                $this->cHash = $cHash;
                $this->jumpurl = $jumpurl;
-               $this->MP = $this->TYPO3_CONF_VARS['FE']['enable_mount_pids'] ? $MP : '';
+               $this->MP = $this->TYPO3_CONF_VARS['FE']['enable_mount_pids'] ? (string)$MP : '';
                $this->RDCT = $RDCT;
                $this->clientInfo = t3lib_div::clientInfo();
                $this->uniqueString=md5(microtime());
                        $GLOBALS['TT']->pull();
                        $GLOBALS['TT']->push('Cache Row','');
                                if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
-                                       $this->config = unserialize($row['cache_data']);                // Fetches the lowlevel config stored with the cached data
+                                       $this->config = (array)unserialize($row['cache_data']);         // Fetches the lowlevel config stored with the cached data
                                        $this->content = $row['HTML'];  // Getting the content
                                        $this->cacheContentFlag=1;      // Setting flag, so we know, that some cached content is gotten.
 
                                'all' => $this->all,
                                'id' => intval($this->id),
                                'type' => intval($this->type),
-                               'gr_list' => $this->gr_list,
-                               'MP' => $this->MP,
+                               'gr_list' => (string)$this->gr_list,
+                               'MP' => (string)$this->MP,
                                'cHash' => $this->cHash_array
                        )
                );
@@ -2389,8 +2389,16 @@ if (version == "n3") {
        function processOutput()        {
                        // Substitutes username mark with the username
                if ($this->fe_user->user['uid'])        {
+
+                               // User name:
                        $token = trim($this->config['config']['USERNAME_substToken']);
                        $this->content = str_replace($token ? $token : '<!--###USERNAME###-->',$this->fe_user->user['username'],$this->content);
+
+                               // User uid (if configured):
+                       $token = trim($this->config['config']['USERUID_substToken']);
+                       if ($token)     {
+                               $this->content = str_replace($token, $this->fe_user->user['uid'], $this->content);
+                       }
                }
                        // Substitutes get_URL_ID in case of GET-fallback
                if ($this->getMethodUrlIdToken) {
diff --git a/typo3/sysext/indexed_search/class.doublemetaphone.php b/typo3/sysext/indexed_search/class.doublemetaphone.php
new file mode 100755 (executable)
index 0000000..856218f
--- /dev/null
@@ -0,0 +1,1014 @@
+<?php
+// VERSION DoubleMetaphone Class 1.01
+//
+// DESCRIPTION
+//
+//   This class implements a "sounds like" algorithm developed
+//   by Lawrence Philips which he published in the June, 2000 issue
+//   of C/C++ Users Journal.  Double Metaphone is an improved
+//   version of Philips' original Metaphone algorithm.
+//
+// COPYRIGHT
+//
+//   Copyright 2001, Stephen Woodbridge <woodbri@swoodbridge.com>
+//   All rights reserved.
+//
+//   http://swoodbridge.com/DoubleMetaPhone/
+//
+//   This PHP translation is based heavily on the C implementation
+//   by Maurice Aubrey <maurice@hevanet.com>, which in turn
+//   is based heavily on the C++ implementation by
+//   Lawrence Philips and incorporates several bug fixes courtesy
+//   of Kevin Atkinson <kevina@users.sourceforge.net>.
+//
+//   This module is free software; you may redistribute it and/or
+//   modify it under the same terms as Perl itself.
+//
+// CONTRIBUTIONS
+//
+//   17-May-2002 Geoff Caplan  http://www.advantae.com
+//     Bug fix: added code to return class object which I forgot to do
+//     Created a functional callable version instead of the class version
+//     which is faster if you are calling this a lot.
+//
+// ------------------------------------------------------------------
+
+
+
+// TYPO3: Had to change name to "user_DoubleMetaPhone" from just "DoubleMetaPhone" because TYPO3 requires a user class to carry the "user_" prefix.
+// TYPO3: If you want to use this metaphone method instead of the default in class.indexer.php, configure TYPO3 by adding the line below to your localconf.php file:
+// TYPO3:                      $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
+// TYPO3: Of course you can write your own metaphone hook methods by taking this class and configuration as example.
+
+class user_DoubleMetaPhone
+{
+//  properties
+
+   var $original  = "";
+   var $primary   = "";
+   var $secondary = "";
+   var $length    =  0;
+   var $last      =  0;
+   var $current   =  0;
+
+//  methods
+
+               // TYPO3 specific API to this class. BEGIN
+       function metaphone($string)     {
+               $res = $this->DoubleMetaPhone($string);
+               #debug(array($string,$res['primary']));
+               return $res['primary'];
+       }
+               // TYPO3 specific API to this class. END
+
+
+  // Public method
+
+  function DoubleMetaPhone($string) {
+
+   $this->primary   = "";
+   $this->secondary = "";
+   $this->current   =  0;
+
+    $this->current  = 0;
+    $this->length   = strlen($string);
+    $this->last     = $this->length - 1;
+    $this->original = $string . "     ";
+
+    $this->original = strtoupper($this->original);
+
+    // skip this at beginning of word
+    if ($this->StringAt($this->original, 0, 2,
+                        array('GN', 'KN', 'PN', 'WR', 'PS')))
+      $this->current++;
+
+    // Initial 'X' is pronounced 'Z' e.g. 'Xavier'
+    if (substr($this->original, 0, 1) == 'X') {
+      $this->primary   .= "S";   // 'Z' maps to 'S'
+      $this->secondary .= "S";
+      $this->current++;
+    }
+
+    // main loop
+
+    while (strlen($this->primary) < 4 || strlen($this->secondary < 4)) {
+      if ($this->current >= $this->length)
+        break;
+
+      switch (substr($this->original, $this->current, 1)) {
+        case 'A':
+        case 'E':
+        case 'I':
+        case 'O':
+        case 'U':
+        case 'Y':
+          if ($this->current == 0) {
+            // all init vowels now map to 'A'
+            $this->primary   .= 'A';
+            $this->secondary .= 'A';
+          }
+          $this->current += 1;
+          break;
+
+        case 'B':
+          // '-mb', e.g. "dumb", already skipped over ...
+          $this->primary   .= 'P';
+          $this->secondary .= 'P';
+
+          if (substr($this->original, $this->current + 1, 1) == 'B')
+            $this->current += 2;
+          else
+            $this->current += 1;
+          break;
+
+        case 'Ç':
+          $this->primary   .= 'S';
+          $this->secondary .= 'S';
+          $this->current += 1;
+          break;
+
+        case 'C':
+          // various germanic
+          if (($this->current > 1)
+              && !$this->IsVowel($this->original, $this->current - 2)
+              && $this->StringAt($this->original, $this->current - 1, 3,
+                        array("ACH"))
+              && ((substr($this->original, $this->current + 2, 1) != 'I')
+                  && ((substr($this->original, $this->current + 2, 1) != 'E')
+                      || $this->StringAt($this->original, $this->current - 2, 6,
+                                array("BACHER", "MACHER"))))) {
+
+            $this->primary   .= 'K';
+            $this->secondary .= 'K';
+            $this->current += 2;
+            break;
+          }
+
+          // special case 'caesar'
+          if (($this->current == 0)
+              && $this->StringAt($this->original, $this->current, 6,
+                         array("CAESAR"))) {
+            $this->primary   .= 'S';
+            $this->secondary .= 'S';
+            $this->current += 2;
+            break;
+          }
+
+          // italian 'chianti'
+          if ($this->StringAt($this->original, $this->current, 4,
+                         array("CHIA"))) {
+            $this->primary   .= 'K';
+            $this->secondary .= 'K';
+            $this->current += 2;
+            break;
+          }
+
+          if ($this->StringAt($this->original, $this->current, 2,
+                         array("CH"))) {
+
+            // find 'michael'
+            if (($this->current > 0)
+                && $this->StringAt($this->original, $this->current, 4,
+                         array("CHAE"))) {
+              $this->primary   .= 'K';
+              $this->secondary .= 'X';
+              $this->current += 2;
+              break;
+            }
+
+            // greek roots e.g. 'chemistry', 'chorus'
+            if (($this->current == 0)
+                && ($this->StringAt($this->original, $this->current + 1, 5,
+                         array("HARAC", "HARIS"))
+                    || $this->StringAt($this->original, $this->current + 1, 3,
+                              array("HOR", "HYM", "HIA", "HEM")))
+                && !$this->StringAt($this->original, 0, 5, array("CHORE"))) {
+              $this->primary   .= 'K';
+              $this->secondary .= 'K';
+              $this->current += 2;
+              break;
+            }
+
+            // germanic, greek, or otherwise 'ch' for 'kh' sound
+            if (($this->StringAt($this->original, 0, 4, array("VAN ", "VON "))
+                 || $this->StringAt($this->original, 0, 3, array("SCH")))
+                // 'architect', 'orchestra', 'orchid', but not 'arch'
+                || $this->StringAt($this->original, $this->current - 2, 6,
+                         array("ORCHES", "ARCHIT", "ORCHID"))
+                || $this->StringAt($this->original, $this->current + 2, 1,
+                         array("T", "S"))
+                || (($this->StringAt($this->original, $this->current - 1, 1,
+                         array("A","O","U","E"))
+                     || ($this->current == 0))
+                    // e.g. 'wachtler', 'weschsler', but not 'tichner'
+                    && $this->StringAt($this->original, $this->current + 2, 1,
+                         array("L","R","N","M","B","H","F","V","W"," ")))) {
+              $this->primary   .= 'K';
+              $this->secondary .= 'K';
+            } else {
+              if ($this->current > 0) {
+                if ($this->StringAt($this->original, 0, 2, array("MC"))) {
+                  // e.g. 'McHugh'
+                  $this->primary   .= 'K';
+                  $this->secondary .= 'K';
+                } else {
+                  $this->primary   .= 'X';
+                  $this->secondary .= 'K';
+                }
+              } else {
+                $this->primary   .= 'X';
+                $this->secondary .= 'X';
+              }
+            }
+            $this->current += 2;
+            break;
+          }
+
+          // e.g. 'czerny'
+          if ($this->StringAt($this->original, $this->current, 2, array("CZ"))
+              && !$this->StringAt($this->original, $this->current -2, 4,
+                         array("WICZ"))) {
+            $this->primary   .= 'S';
+            $this->secondary .= 'X';
+            $this->current += 2;
+            break;
+          }
+
+          // e.g. 'focaccia'
+          if ($this->StringAt($this->original, $this->current + 1, 3,
+                     array("CIA"))) {
+            $this->primary   .= 'X';
+            $this->secondary .= 'X';
+            $this->current += 3;
+            break;
+          }
+
+          // double 'C', but not 'McClellan'
+          if ($this->StringAt($this->original, $this->current, 2, array("CC"))
+              && !(($this->current == 1)
+                   && (substr($this->original, 0, 1) == 'M'))) {
+            // 'bellocchio' but not 'bacchus'
+            if ($this->StringAt($this->original, $this->current + 2, 1,
+                       array("I","E","H"))
+                && !$this->StringAt($this->original, $this->current + 2, 2,
+                          array("HU"))) {
+              // 'accident', 'accede', 'succeed'
+              if ((($this->current == 1)
+                   && (substr($this->original, $this->current - 1, 1) == 'A'))
+                  || $this->StringAt($this->original, $this->current - 1, 5,
+                            array("UCCEE", "UCCES"))) {
+                $this->primary   .= "KS";
+                $this->secondary .= "KS";
+                // 'bacci', 'bertucci', other italian
+              } else {
+                $this->primary   .= "X";
+                $this->secondary .= "X";
+              }
+              $this->current += 3;
+              break;
+            } else {
+              // Pierce's rule
+              $this->primary   .= "K";
+              $this->secondary .= "K";
+              $this->current += 2;
+              break;
+            }
+          }
+
+          if ($this->StringAt($this->original, $this->current, 2,
+                     array("CK","CG","CQ"))) {
+            $this->primary   .= "K";
+            $this->secondary .= "K";
+            $this->current += 2;
+            break;
+          }
+
+          if ($this->StringAt($this->original, $this->current, 2,
+                     array("CI","CE","CY"))) {
+            // italian vs. english
+            if ($this->StringAt($this->original, $this->current, 3,
+                       array("CIO","CIE","CIA"))) {
+              $this->primary   .= "S";
+              $this->secondary .= "X";
+            } else {
+              $this->primary   .= "S";
+              $this->secondary .= "S";
+            }
+            $this->current += 2;
+            break;
+          }
+
+          // else
+          $this->primary   .= "K";
+          $this->secondary .= "K";
+
+          // name sent in 'mac caffrey', 'mac gregor'
+          if ($this->StringAt($this->original, $this->current + 1, 2,
+                     array(" C"," Q"," G"))) {
+            $this->current += 3;
+          } else {
+            if ($this->StringAt($this->original, $this->current + 1, 1,
+                       array("C","K","Q"))
+                && !$this->StringAt($this->original, $this->current + 1, 2,
+                           array("CE","CI"))) {
+              $this->current += 2;
+            } else {
+              $this->current += 1;
+            }
+          }
+          break;
+
+        case 'D':
+          if ($this->StringAt($this->original, $this->current, 2,
+                     array("DG"))) {
+            if ($this->StringAt($this->original, $this->current + 2, 1,
+                       array("I","E","Y"))) {
+              // e.g. 'edge'
+              $this->primary   .= "J";
+              $this->secondary .= "J";
+              $this->current += 3;
+
+              break;
+            } else {
+              // e.g. 'edgar'
+              $this->primary   .= "TK";
+              $this->secondary .= "TK";
+              $this->current += 2;
+              break;
+            }
+          }
+
+          if ($this->StringAt($this->original, $this->current, 2,
+                     array("DT","DD"))) {
+            $this->primary   .= "T";
+            $this->secondary .= "T";
+            $this->current += 2;
+            break;
+          }
+
+          // else
+          $this->primary   .= "T";
+          $this->secondary .= "T";
+          $this->current += 1;
+          break;
+
+        case 'F':
+          if (substr($this->original, $this->current + 1, 1) == 'F')
+            $this->current += 2;
+          else
+            $this->current += 1;
+          $this->primary   .= "F";
+          $this->secondary .= "F";
+          break;
+
+        case 'G':
+          if (substr($this->original, $this->current + 1, 1) == 'H') {
+            if (($this->current > 0)
+                && !$this->IsVowel($this->original, $this->current - 1)) {
+              $this->primary   .= "K";
+              $this->secondary .= "K";
+              $this->current += 2;
+              break;
+            }
+
+            if ($this->current < 3) {
+              // 'ghislane', 'ghiradelli'
+              if ($this->current == 0) {
+                if (substr($this->original, $this->current + 2, 1) == 'I') {
+                  $this->primary   .= "J";
+                  $this->secondary .= "J";
+                } else {
+                  $this->primary   .= "K";
+                  $this->secondary .= "K";
+                }
+                $this->current += 2;
+                break;
+              }
+            }
+
+            // Parker's rule (with some further refinements) - e.g. 'hugh'
+            if ((($this->current > 1)
+                 && $this->StringAt($this->original, $this->current - 2, 1,
+                           array("B","H","D")))
+                // e.g. 'bough'
+                || (($this->current > 2)
+                    &&  $this->StringAt($this->original, $this->current - 3, 1,
+                               array("B","H","D")))
+                // e.g. 'broughton'
+                || (($this->current > 3)
+                    && $this->StringAt($this->original, $this->current - 4, 1,
+                               array("B","H")))) {
+              $this->current += 2;
+              break;
+            } else {
+              // e.g. 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
+              if (($this->current > 2)
+                  && (substr($this->original, $this->current - 1, 1) == 'U')
+                  && $this->StringAt($this->original, $this->current - 3, 1,
+                            array("C","G","L","R","T"))) {
+                $this->primary   .= "F";
+                $this->secondary .= "F";
+              } elseif (($this->current > 0)
+                        && substr($this->original, $this->current - 1, 1) != 'I') {
+                $this->primary   .= "K";
+                $this->secondary .= "K";
+              }
+              $this->current += 2;
+              break;
+            }
+          }
+
+          if (substr($this->original, $this->current + 1, 1) == 'N') {
+            if (($this->current == 1) && $this->IsVowel($this->original, 0)
+                && !$this->SlavoGermanic($this->original)) {
+              $this->primary   .= "KN";
+              $this->secondary .= "N";
+            } else {
+              // not e.g. 'cagney'
+              if (!$this->StringAt($this->original, $this->current + 2, 2,
+                          array("EY"))
+                  && (substr($this->original, $this->current + 1) != "Y")
+                  && !$this->SlavoGermanic($this->original)) {
+                 $this->primary   .= "N";
+                 $this->secondary .= "KN";
+              } else {
+                 $this->primary   .= "KN";
+                 $this->secondary .= "KN";
+              }
+            }
+            $this->current += 2;
+            break;
+          }
+
+          // 'tagliaro'
+          if ($this->StringAt($this->original, $this->current + 1, 2,
+                     array("LI"))
+              && !$this->SlavoGermanic($this->original)) {
+            $this->primary   .= "KL";
+            $this->secondary .= "L";
+            $this->current += 2;
+            break;
+          }
+
+          // -ges-, -gep-, -gel- at beginning
+          if (($this->current == 0)
+              && ((substr($this->original, $this->current + 1, 1) == 'Y')
+                  || $this->StringAt($this->original, $this->current + 1, 2,
+                            array("ES","EP","EB","EL","EY","IB","IL","IN","IE",
+                                  "EI","ER")))) {
+            $this->primary   .= "K";
+            $this->secondary .= "J";
+            $this->current += 2;
+            break;
+          }
+
+          // -ger-, -gy-
+          if (($this->StringAt($this->original, $this->current + 1, 2,
+                      array("ER"))
+               || (substr($this->original, $this->current + 1, 1) == 'Y'))
+              && !$this->StringAt($this->original, 0, 6,
+                         array("DANGER","RANGER","MANGER"))
+              && !$this->StringAt($this->original, $this->current -1, 1,
+                         array("E", "I"))
+              && !$this->StringAt($this->original, $this->current -1, 3,
+                         array("RGY","OGY"))) {
+            $this->primary   .= "K";
+            $this->secondary .= "J";
+            $this->current += 2;
+            break;
+          }
+
+          // italian e.g. 'biaggi'
+          if ($this->StringAt($this->original, $this->current + 1, 1,
+                     array("E","I","Y"))
+              || $this->StringAt($this->original, $this->current -1, 4,
+                        array("AGGI","OGGI"))) {
+            // obvious germanic
+            if (($this->StringAt($this->original, 0, 4, array("VAN ", "VON "))
+                 || $this->StringAt($this->original, 0, 3, array("SCH")))
+                || $this->StringAt($this->original, $this->current + 1, 2,
+                          array("ET"))) {
+              $this->primary   .= "K";
+              $this->secondary .= "K";
+            } else {
+              // always soft if french ending
+              if ($this->StringAt($this->original, $this->current + 1, 4,
+                         array("IER "))) {
+                $this->primary   .= "J";
+                $this->secondary .= "J";
+              } else {
+                $this->primary   .= "J";
+                $this->secondary .= "K";
+              }
+            }
+            $this->current += 2;
+            break;
+          }
+
+          if (substr($this->original, $this->current +1, 1) == 'G')
+            $this->current += 2;
+          else
+            $this->current += 1;
+
+          $this->primary   .= 'K';
+          $this->secondary .= 'K';
+          break;
+
+        case 'H':
+          // only keep if first & before vowel or btw. 2 vowels
+          if ((($this->current == 0) ||
+               $this->IsVowel($this->original, $this->current - 1))
+              && $this->IsVowel($this->original, $this->current + 1)) {
+            $this->primary   .= 'H';
+            $this->secondary .= 'H';
+            $this->current += 2;
+          } else
+            $this->current += 1;
+          break;
+
+        case 'J':
+          // obvious spanish, 'jose', 'san jacinto'
+          if ($this->StringAt($this->original, $this->current, 4,
+                     array("JOSE"))
+              || $this->StringAt($this->original, 0, 4, array("SAN "))) {
+            if ((($this->current == 0)
+                 && (substr($this->original, $this->current + 4, 1) == ' '))
+                || $this->StringAt($this->original, 0, 4, array("SAN "))) {
+              $this->primary   .= 'H';
+              $this->secondary .= 'H';
+            } else {
+              $this->primary   .= "J";
+              $this->secondary .= 'H';
+            }
+            $this->current += 1;
+            break;
+          }
+
+          if (($this->current == 0)
+              && !$this->StringAt($this->original, $this->current, 4,
+                     array("JOSE"))) {
+            $this->primary   .= 'J';  // Yankelovich/Jankelowicz
+            $this->secondary .= 'A';
+          } else {
+            // spanish pron. of .e.g. 'bajador'
+            if ($this->IsVowel($this->original, $this->current - 1)
+                && !$this->SlavoGermanic($this->original)
+                && ((substr($this->original, $this->current + 1, 1) == 'A')
+                    || (substr($this->original, $this->current + 1, 1) == 'O'))) {
+              $this->primary   .= "J";
+              $this->secondary .= "H";
+            } else {
+              if ($this->current == $this->last) {
+                $this->primary   .= "J";
+                $this->secondary .= "";
+              } else {
+                if (!$this->StringAt($this->original, $this->current + 1, 1,
+                            array("L","T","K","S","N","M","B","Z"))
+                    && !$this->StringAt($this->original, $this->current - 1, 1,
+                               array("S","K","L"))) {
+                  $this->primary   .= "J";
+                  $this->secondary .= "J";
+                }
+              }
+            }
+          }
+
+          if (substr($this->original, $this->current + 1, 1) == 'J') // it could happen
+            $this->current += 2;
+          else
+            $this->current += 1;
+          break;
+
+        case 'K':
+          if (substr($this->original, $this->current + 1, 1) == 'K')
+            $this->current += 2;
+          else
+            $this->current += 1;
+          $this->primary   .= "K";
+          $this->secondary .= "K";
+          break;
+
+        case 'L':
+          if (substr($this->original, $this->current + 1, 1) == 'L') {
+            // spanish e.g. 'cabrillo', 'gallegos'
+            if ((($this->current == ($this->length - 3))
+                 && $this->StringAt($this->original, $this->current - 1, 4,
+                           array("ILLO","ILLA","ALLE")))
+                || (($this->StringAt($this->original, $this->last-1, 2,
+                            array("AS","OS"))
+                  || $this->StringAt($this->original, $this->last, 1,
+                            array("A","O")))
+                 && $this->StringAt($this->original, $this->current - 1, 4,
+                           array("ALLE")))) {
+              $this->primary   .= "L";
+              $this->secondary .= "";
+              $this->current += 2;
+              break;
+            }
+            $this->current += 2;
+          } else
+            $this->current += 1;
+          $this->primary   .= "L";
+          $this->secondary .= "L";
+          break;
+
+        case 'M':
+          if (($this->StringAt($this->original, $this->current - 1, 3,
+                     array("UMB"))
+               && ((($this->current + 1) == $this->last)
+                   || $this->StringAt($this->original, $this->current + 2, 2,
+                            array("ER"))))
+              // 'dumb', 'thumb'
+              || (substr($this->original, $this->current + 1, 1) == 'M')) {
+              $this->current += 2;
+          } else {
+              $this->current += 1;
+          }
+          $this->primary   .= "M";
+          $this->secondary .= "M";
+          break;
+
+        case 'N':
+          if (substr($this->original, $this->current + 1, 1) == 'N')
+            $this->current += 2;
+          else
+            $this->current += 1;
+          $this->primary   .= "N";
+          $this->secondary .= "N";
+          break;
+
+        case 'Ñ':
+          $this->current += 1;
+          $this->primary   .= "N";
+          $this->secondary .= "N";
+          break;
+
+        case 'P':
+          if (substr($this->original, $this->current + 1, 1) == 'H') {
+            $this->current += 2;
+            $this->primary   .= "F";
+            $this->secondary .= "F";
+            break;
+          }
+
+          // also account for "campbell" and "raspberry"
+          if ($this->StringAt($this->original, $this->current + 1, 1,
+                     array("P","B")))
+            $this->current += 2;
+          else
+            $this->current += 1;
+          $this->primary   .= "P";
+          $this->secondary .= "P";
+          break;
+
+        case 'Q':
+          if (substr($this->original, $this->current + 1, 1) == 'Q')
+            $this->current += 2;
+          else
+            $this->current += 1;
+          $this->primary   .= "K";
+          $this->secondary .= "K";
+          break;
+
+        case 'R':
+          // french e.g. 'rogier', but exclude 'hochmeier'
+          if (($this->current == $this->last)
+              && !$this->SlavoGermanic($this->original)
+              && $this->StringAt($this->original, $this->current - 2, 2,
+                        array("IE"))
+              && !$this->StringAt($this->original, $this->current - 4, 2,
+                         array("ME","MA"))) {
+            $this->primary   .= "";
+            $this->secondary .= "R";
+          } else {
+            $this->primary   .= "R";
+            $this->secondary .= "R";
+          }
+          if (substr($this->original, $this->current + 1, 1) == 'R')
+            $this->current += 2;
+          else
+            $this->current += 1;
+          break;
+
+        case 'S':
+          // special cases 'island', 'isle', 'carlisle', 'carlysle'
+          if ($this->StringAt($this->original, $this->current - 1, 3,
+                     array("ISL","YSL"))) {
+            $this->current += 1;
+            break;
+          }
+
+          // special case 'sugar-'
+          if (($this->current == 0)
+              && $this->StringAt($this->original, $this->current, 5,
+                        array("SUGAR"))) {
+            $this->primary   .= "X";
+            $this->secondary .= "S";
+            $this->current += 1;
+            break;
+          }
+
+          if ($this->StringAt($this->original, $this->current, 2,
+                     array("SH"))) {
+            // germanic
+            if ($this->StringAt($this->original, $this->current + 1, 4,
+                       array("HEIM","HOEK","HOLM","HOLZ"))) {
+              $this->primary   .= "S";
+              $this->secondary .= "S";
+            } else {
+              $this->primary   .= "X";
+              $this->secondary .= "X";
+            }
+            $this->current += 2;
+            break;
+          }
+
+          // italian & armenian
+          if ($this->StringAt($this->original, $this->current, 3,
+                     array("SIO","SIA"))
+              || $this->StringAt($this->original, $this->current, 4,
+                        array("SIAN"))) {
+            if (!$this->SlavoGermanic($this->original)) {
+              $this->primary   .= "S";
+              $this->secondary .= "X";
+            } else {
+              $this->primary   .= "S";
+              $this->secondary .= "S";
+            }
+            $this->current += 3;
+            break;
+          }
+
+          // german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
+          // also, -sz- in slavic language altho in hungarian it is pronounced 's'
+          if ((($this->current == 0)
+               && $this->StringAt($this->original, $this->current + 1, 1,
+                         array("M","N","L","W")))
+              || $this->StringAt($this->original, $this->current + 1, 1,
+                        array("Z"))) {
+            $this->primary   .= "S";
+            $this->secondary .= "X";
+            if ($this->StringAt($this->original, $this->current + 1, 1,
+                        array("Z")))
+              $this->current += 2;
+            else
+              $this->current += 1;
+            break;
+          }
+
+          if ($this->StringAt($this->original, $this->current, 2,
+                     array("SC"))) {
+            // Schlesinger's rule
+            if (substr($this->original, $this->current + 2, 1) == 'H')
+              // dutch origin, e.g. 'school', 'schooner'
+              if ($this->StringAt($this->original, $this->current + 3, 2,
+                         array("OO","ER","EN","UY","ED","EM"))) {
+                // 'schermerhorn', 'schenker'
+                if ($this->StringAt($this->original, $this->current + 3, 2,
+                           array("ER","EN"))) {
+                  $this->primary   .= "X";
+                  $this->secondary .= "SK";
+                } else {
+                  $this->primary   .= "SK";
+                  $this->secondary .= "SK";
+                }
+                $this->current += 3;
+                break;
+              } else {
+                if (($this->current == 0)
+                    && !$this->IsVowel($this->original, 3)
+                    && (substr($this->original, $this->current + 3, 1) != 'W')) {
+                  $this->primary   .= "X";
+                  $this->secondary .= "S";
+                } else {
+                  $this->primary   .= "X";
+                  $this->secondary .= "X";
+                }
+                $this->current += 3;
+                break;
+              }
+
+              if ($this->StringAt($this->original, $this->current + 2, 1,
+                         array("I","E","Y"))) {
+                $this->primary   .= "S";
+                $this->secondary .= "S";
+                $this->current += 3;
+                break;
+              }
+
+            // else
+            $this->primary   .= "SK";
+            $this->secondary .= "SK";
+            $this->current += 3;
+            break;
+          }
+
+          // french e.g. 'resnais', 'artois'
+          if (($this->current == $this->last)
+              && $this->StringAt($this->original, $this->current - 2, 2,
+                        array("AI","OI"))) {
+            $this->primary   .= "";
+            $this->secondary .= "S";
+          } else {
+            $this->primary   .= "S";
+            $this->secondary .= "S";
+          }
+
+          if ($this->StringAt($this->original, $this->current + 1, 1,
+                     array("S","Z")))
+            $this->current += 2;
+          else
+            $this->current += 1;
+          break;
+
+        case 'T':
+          if ($this->StringAt($this->original, $this->current, 4,
+                     array("TION"))) {
+            $this->primary   .= "X";
+            $this->secondary .= "X";
+            $this->current += 3;
+            break;
+          }
+
+          if ($this->StringAt($this->original, $this->current, 3,
+                     array("TIA","TCH"))) {
+            $this->primary   .= "X";
+            $this->secondary .= "X";
+            $this->current += 3;
+            break;
+          }
+
+          if ($this->StringAt($this->original, $this->current, 2,
+                     array("TH"))
+              || $this->StringAt($this->original, $this->current, 3,
+                            array("TTH"))) {
+            // special case 'thomas', 'thames' or germanic
+            if ($this->StringAt($this->original, $this->current + 2, 2,
+                       array("OM","AM"))
+                || $this->StringAt($this->original, 0, 4, array("VAN ","VON "))
+                || $this->StringAt($this->original, 0, 3, array("SCH"))) {
+              $this->primary   .= "T";
+              $this->secondary .= "T";
+            } else {
+              $this->primary   .= "0";
+              $this->secondary .= "T";
+            }
+            $this->current += 2;
+            break;
+          }
+
+          if ($this->StringAt($this->original, $this->current + 1, 1,
+                     array("T","D")))
+            $this->current += 2;
+          else
+            $this->current += 1;
+          $this->primary   .= "T";
+          $this->secondary .= "T";
+          break;
+
+        case 'V':
+          if (substr($this->original, $this->current + 1, 1) == 'V')
+            $this->current += 2;
+          else
+            $this->current += 1;
+          $this->primary   .= "F";
+          $this->secondary .= "F";
+          break;
+
+        case 'W':
+          // can also be in middle of word
+          if ($this->StringAt($this->original, $this->current, 2, array("WR"))) {
+            $this->primary   .= "R";
+            $this->secondary .= "R";
+            $this->current += 2;
+            break;
+          }
+
+          if (($this->current == 0)
+              && ($this->IsVowel($this->original, $this->current + 1)
+                  || $this->StringAt($this->original, $this->current, 2,
+                            array("WH")))) {
+            // Wasserman should match Vasserman
+            if ($this->IsVowel($this->original, $this->current + 1)) {
+              $this->primary   .= "A";
+              $this->secondary .= "F";
+            } else {
+              // need Uomo to match Womo
+              $this->primary   .= "A";
+              $this->secondary .= "A";
+            }
+          }
+
+          // Arnow should match Arnoff
+          if ((($this->current == $this->last)
+                && $this->IsVowel($this->original, $this->current - 1))
+              || $this->StringAt($this->original, $this->current - 1, 5,
+                        array("EWSKI","EWSKY","OWSKI","OWSKY"))
+              || $this->StringAt($this->original, 0, 3, array("SCH"))) {
+            $this->primary   .= "";
+            $this->secondary .= "F";
+            $this->current += 1;
+            break;
+          }
+
+          // polish e.g. 'filipowicz'
+          if ($this->StringAt($this->original, $this->current, 4,
+                     array("WICZ","WITZ"))) {
+            $this->primary   .= "TS";
+            $this->secondary .= "FX";
+            $this->current += 4;
+            break;
+          }
+
+          // else skip it
+          $this->current += 1;
+          break;
+
+        case 'X':
+          // french e.g. breaux
+          if (!(($this->current == $this->last)
+                && ($this->StringAt($this->original, $this->current - 3, 3,
+                           array("IAU", "EAU"))
+                 || $this->StringAt($this->original, $this->current - 2, 2,
+                           array("AU", "OU"))))) {
+            $this->primary   .= "KS";
+            $this->secondary .= "KS";
+          }
+
+          if ($this->StringAt($this->original, $this->current + 1, 1,
+                     array("C","X")))
+            $this->current += 2;
+          else
+            $this->current += 1;
+          break;
+
+        case 'Z':
+          // chinese pinyin e.g. 'zhao'
+          if (substr($this->original, $this->current + 1, 1) == "H") {
+            $this->primary   .= "J";
+            $this->secondary .= "J";
+            $this->current += 2;
+            break;
+          } elseif ($this->StringAt($this->original, $this->current + 1, 2,
+                           array("ZO", "ZI", "ZA"))
+                    || ($this->SlavoGermanic($this->original)
+                        && (($this->current > 0)
+                            && substr($this->original, $this->current - 1, 1) != 'T'))) {
+            $this->primary   .= "S";
+            $this->secondary .= "TS";
+          } else {
+            $this->primary   .= "S";
+            $this->secondary .= "S";
+          }
+
+          if (substr($this->original, $this->current + 1, 1) == 'Z')
+            $this->current += 2;
+          else
+            $this->current += 1;
+          break;
+
+        default:
+          $this->current += 1;
+
+      } // end switch
+
+    // printf("<br>ORIGINAL:    '%s'\n", $this->original);
+    // printf("<br>current:    '%s'\n", $this->current);
+    // printf("<br>  PRIMARY:   '%s'\n", $this->primary);
+    // printf("<br>  SECONDARY: '%s'\n", $this->secondary);
+
+    } // end while
+
+    $this->primary   = substr($this->primary,   0, 4);
+    $this->secondary = substr($this->secondary, 0, 4);
+
+    $result["primary"] = $this->primary ;
+    $result["secondary"] = $this->secondary ;
+
+    return $result ;
+
+  } // end of function MetaPhone
+
+
+  // Private methods
+
+  /**
+   * Checks whether the slice of $string that begins at offset $start and
+   * spans $length characters equals one of the candidates in $list.
+   *
+   * @param string  $string  Text to look into
+   * @param integer $start   Zero-based offset of the slice
+   * @param integer $length  Number of characters to compare
+   * @param array   $list    Candidate strings to compare the slice against
+   * @return integer 1 when a candidate equals the slice, otherwise 0
+   *                 (also 0 when $start lies outside of $string)
+   */
+  function StringAt($string, $start, $length, $list) {
+    // An offset before the first or after the last character never matches.
+    if ($start < 0 || $start >= strlen($string)) {
+      return 0;
+    }
+
+    // The slice does not depend on the candidate, so cut it out only once.
+    $slice = substr($string, $start, $length);
+    foreach ($list as $candidate) {
+      if ($candidate == $slice) {
+        return 1;
+      }
+    }
+
+    return 0;
+  }
+
+  /**
+   * Tells whether the character at offset $pos of $string is one of
+   * A, E, I, O, U or Y (upper case only).
+   *
+   * @param string  $string  Text to look into
+   * @param integer $pos     Zero-based offset of the character to test
+   * @return integer 1 when the character is a vowel, otherwise 0
+   *                 (also 0 when $pos lies outside of $string)
+   */
+  function IsVowel($string, $pos) {
+    // A negative offset would make substr() count from the end of the
+    // string, so reject out-of-range positions the same way StringAt() does.
+    if (($pos < 0) || ($pos >= strlen($string)))
+      return 0;
+
+    // preg_match() instead of ereg(): the POSIX regex functions were
+    // deprecated in PHP 5.3 and removed in PHP 7.0.
+    return preg_match('/[AEIOUY]/', substr($string, $pos, 1));
+  }
+
+  /**
+   * Tells whether $string contains one of the letter groups W, K, CZ or
+   * WITZ anywhere. The match is case-sensitive (upper case only).
+   *
+   * @param string $string  Text to inspect
+   * @return integer 1 when one of the groups occurs in $string, otherwise 0
+   */
+  function SlavoGermanic($string) {
+    // preg_match() instead of ereg(): the POSIX regex functions were
+    // deprecated in PHP 5.3 and removed in PHP 7.0. The alternation is
+    // kept exactly as before.
+    return preg_match('/W|K|CZ|WITZ/', $string);
+  }
+} // end of class MetaPhone
+?>
\ No newline at end of file
diff --git a/typo3/sysext/indexed_search/class.external_parser.php b/typo3/sysext/indexed_search/class.external_parser.php
new file mode 100755 (executable)
index 0000000..7bcb686
--- /dev/null
@@ -0,0 +1,536 @@
+<?php
+/***************************************************************
+*  Copyright notice
+*
+*  (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
+*  All rights reserved
+*
+*  This script is part of the TYPO3 project. The TYPO3 project is
+*  free software; you can redistribute it and/or modify
+*  it under the terms of the GNU General Public License as published by
+*  the Free Software Foundation; either version 2 of the License, or
+*  (at your option) any later version.
+*
+*  The GNU General Public License can be found at
+*  http://www.gnu.org/copyleft/gpl.html.
+*  A copy is found in the textfile GPL.txt and important notices to the license
+*  from the author is found in LICENSE.txt distributed with these scripts.
+*
+*
+*  This script is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*  GNU General Public License for more details.
+*
+*  This copyright notice MUST APPEAR in all copies of the script!
+***************************************************************/
+/**
+ * External standard parsers for indexed_search
+ *
+ * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
+ * @coauthor   Olivier Simah <noname_paris@yahoo.fr>
+ */
+/**
+ * [CLASS/FUNCTION INDEX of SCRIPT]
+ *
+ *
+ *
+ *   73: class tx_indexed_search_extparse
+ *   90:     function initParser($extension)
+ *  215:     function initBackend($extension)
+ *
+ *              SECTION: Reading documents (for parsing)
+ *  261:     function readFileContent($ext,$absFile,$cPKey)
+ *  441:     function fileContentParts($ext,$absFile)
+ *  480:     function splitPdfInfo($pdfInfoArray)
+ *  499:     function removeEndJunk($string)
+ *
+ *              SECTION: Backend analyzer
+ *  526:     function getIcon($extension)
+ *
+ * TOTAL FUNCTIONS: 7
+ * (This index is automatically created/updated by the extension "extdeveval")
+ *
+ */
+
+
+
+
+
+
+
+
+
+
+/**
+ * External standard parsers for indexed_search
+ * MUST RETURN utf-8 content!
+ *
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
+ * @package TYPO3
+ * @subpackage tx_indexedsearch
+ */
+class tx_indexed_search_extparse {
+
+               // This value is also overridden from config.
+       var $pdf_mode = -20;    // Zero: the whole PDF file is indexed as one section. Positive value: (approximate) number of pages per section, eg. "5" for a 10-page file means 1-5,6-10. Negative value: its absolute value is the number of sections, eg. "-3" for a 10-page file gives 1-3,4-6,7-10.
+
+               // This array is configured in initialization:
+       var $app = array();
+
+       var $pObj;              // Reference to parent object (indexer class)
+
+
+       /**
+        * Initialize external parser for parsing content.
+        *
+        * @param       string          File extension
+        * @return      boolean         Returns true if extension is supported/enabled, otherwise false.
+        */
+       function initParser($extension) {
+                       // Prepares $this->app[] (paths to the external tools) for the given file
+                       // extension and records usable extensions in $this->supportedExtensions.
+                       // Always returns a boolean: TRUE when the extension can be parsed, FALSE otherwise.
+
+                       // Read the indexer configuration (serialized extension configuration):
+               $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
+
+                       // If windows, apply extension to tool name:
+               $exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
+               $extOK = FALSE;
+
+                       // Ignore extensions
+               $ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
+               if (in_array($extension, $ignoreExtensions))    {
+                       $this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);
+
+                       return FALSE;
+               }
+
+                       // Switch on file extension. For each configured tool directory one trailing slash is
+                       // stripped (if present) before "/" is appended. preg_replace() is used because the
+                       // POSIX ereg_replace() was removed in PHP 7; the "D" modifier keeps "$" anchored to
+                       // the very end of the string, as it was with ereg.
+                       // With safe_mode enabled the is_file() checks are skipped and the paths are trusted.
+               switch($extension)      {
+                       case 'pdf':
+                                       // PDF
+                               if ($indexerConfig['pdftools']) {
+                                       $pdfPath = preg_replace('#/$#D','',$indexerConfig['pdftools']).'/';
+                                       if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe))) {
+                                               $this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
+                                               $this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
+                                                       // PDF mode:
+                                               $this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
+                                               $extOK = TRUE;
+                                       } else {
+                                               $this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
+                                       }
+                               } else {
+                                       $this->pObj->log_setTSlogMessage('PDF tools disabled',1);
+                               }
+                       break;
+                       case 'doc':
+                                       // Catdoc
+                               if ($indexerConfig['catdoc'])   {
+                                       $catdocPath = preg_replace('#/$#D','',$indexerConfig['catdoc']).'/';
+                                       if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe))        {
+                                               $this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
+                                               $extOK = TRUE;
+                                       } else {
+                                               $this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
+                                       }
+                               } else {
+                                       $this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
+                               }
+                       break;
+                       case 'pps':             // MS PowerPoint(?)
+                       case 'ppt':             // MS PowerPoint
+                                       // ppthtml
+                               if ($indexerConfig['ppthtml'])  {
+                                       $ppthtmlPath = preg_replace('#/$#D','',$indexerConfig['ppthtml']).'/';
+                                       if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe))      {
+                                               $this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
+                                               $extOK = TRUE;
+                                       } else {
+                                               $this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in paths '".$ppthtmlPath."ppthtml'",3);
+                                       }
+                               } else {
+                                       $this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
+                               }
+                       break;
+                       case 'xls':             // MS Excel
+                                       // Xlhtml
+                               if ($indexerConfig['xlhtml'])   {
+                                       $xlhtmlPath = preg_replace('#/$#D','',$indexerConfig['xlhtml']).'/';
+                                       if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe))        {
+                                               $this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
+                                               $extOK = TRUE;
+                                       } else {
+                                               $this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in paths '".$xlhtmlPath."xlhtml'",3);
+                                       }
+                               } else {
+                                       $this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
+                               }
+                       break;
+                       case 'sxc':             // Open Office Calc.
+                       case 'sxi':             // Open Office Impress
+                       case 'sxw':             // Open Office Writer
+                                       // ooo_extract.rb can be found at: http://www.math.umd.edu/~dcarrera/openoffice/misc/tools/ooo_extract.html
+                                       // I had to run this on debian before I could run the ooo_extract.rb script:
+                                       //              apt-get install libzlib-ruby1.8
+                                       //              apt-get install librexml-ruby1.8
+                                       // Either the native method (extension "libunzipped") or ruby + ooo_extract is used:
+                               if ($indexerConfig['nativeOOMethod'])   {
+                                       if (t3lib_extMgm::isLoaded('libunzipped'))      {
+                                               $this->app['nativeOOMethod'] = TRUE;
+                                               $extOK = TRUE;
+                                               $this->pObj->log_setTSlogMessage('Using "libunzipped" for extraction of Open Office files, "'.$extension.'".',1);
+                                       } else {
+                                               $this->pObj->log_setTSlogMessage('The extension "libunzipped" was not loaded (for extraction of Open Office files, "'.$extension.'")',2);
+                                       }
+                               } else {
+                                       if ($indexerConfig['ruby'])     {
+                                               $rubyPath = preg_replace('#/$#D','',$indexerConfig['ruby']).'/';
+                                               $oooExPath = preg_replace('#/$#D','',$indexerConfig['OOoExtract']).'/';
+                                               if (ini_get('safe_mode') || (@is_file($rubyPath.'ruby'.$exe) && @is_file($oooExPath.'ooo_extract.rb')))        {
+                                                       $this->app['ruby'] = $rubyPath.'ruby'.$exe;
+                                                       $this->app['OOo'] = $oooExPath.'ooo_extract.rb';
+                                                       $extOK = TRUE;
+                                               } else {
+                                                       $this->pObj->log_setTSlogMessage("'Ruby and OOo_extract' tools for reading OOo documents were not found in paths '".$rubyPath."ruby' OR '".$oooExPath."ooo_extract.rb'",3);
+                                               }
+                                       } else {
+                                               $this->pObj->log_setTSlogMessage('Ruby & OOo_extract tools (OpenOffice-files) disabled',1);
+                                       }
+                               }
+                       break;
+                       case 'rtf':
+                                       // unrtf
+                               if ($indexerConfig['unrtf'])    {
+                                       $unrtfPath = preg_replace('#/$#D','',$indexerConfig['unrtf']).'/';
+                                       if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe))  {
+                                               $this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
+                                               $extOK = TRUE;
+                                       } else {
+                                               $this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in paths '".$unrtfPath."unrtf'",3);
+                                       }
+                               } else {
+                                       $this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
+                               }
+                       break;
+                       case 'txt':             // Raw text
+                       case 'html':    // PHP strip-tags()
+                       case 'htm':             // PHP strip-tags()
+                       case 'csv':             // Raw text
+                       case 'xml':             // PHP strip-tags()
+                       case 'jpg':             // PHP EXIF
+                       case 'jpeg':    // PHP EXIF
+                       case 'tif':             // PHP EXIF
+                                       // No external tool needed for these types.
+                               $extOK = TRUE;
+                       break;
+               }
+
+                       // If extension was OK, remember it so readFileContent() accepts it:
+               if ($extOK)     {
+                       $this->supportedExtensions[$extension] = TRUE;
+                       return TRUE;
+               }
+
+                       // Unknown, disabled or misconfigured extension. Return an explicit boolean
+                       // (previously the function ended without a return value, i.e. NULL).
+               return FALSE;
+       }
+
+       /**
+        * Initialize external parser for backend modules
+        * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc).
+        *
+        * @param       string          File extension to initialize for.
+        * @return      boolean         Returns true if the extension is supported and enabled, otherwise false.
+        */
+       function initBackend($extension)        {
+                       // Pure list lookup: no configuration or tool paths are examined here, so TRUE only
+                       // means the file type is one the parser knows about.
+               switch($extension)      {
+                       case 'pdf':             // PDF
+                       case 'doc':             // MS Word files
+                       case 'pps':             // MS PowerPoint
+                       case 'ppt':             // MS PowerPoint
+                       case 'xls':             // MS Excel
+                       case 'sxc':             // Open Office Calc.
+                       case 'sxi':             // Open Office Impress
+                       case 'sxw':             // Open Office Writer
+                       case 'rtf':             // RTF documents
+                       case 'txt':             // ASCII Text documents
+                       case 'html':    // HTML
+                       case 'htm':             // HTML
+                       case 'csv':             // Comma Separated Values
+                       case 'xml':             // Generic XML
+                       case 'jpg':             // Jpeg images (EXIF comment)
+                       case 'jpeg':    // Jpeg images (EXIF comment)
+                       case 'tif':             // TIF images (EXIF comment)
+                               return TRUE;
+               }
+
+                       // Any other extension is unknown. Return an explicit boolean
+                       // (previously the function ended without a return value, i.e. NULL).
+               return FALSE;
+       }
+
+
+
+
+
+
+
+
+
+       /************************
+        *
+        * Reading documents (for parsing)
+        *
+        ************************/
+
+       /**
+        * Reads the content of an external file being indexed.
+        *
+        * @param       string          File extension, eg. "pdf", "doc" etc.
+        * @param       string          Absolute filename of file (must exist and be validated OK before calling function)
+        * @param       string          Pointer to section (zero for all file types other than PDF; for PDF it is a page range "low-high" telling which pages of the document to read.)
+        * @return      array           Standard content array (title, description, keywords, body keys)
+        */
+       function readFileContent($ext,$absFile,$cPKey)  {
+                       // Extracts the text of one file (or, for PDF, one page range of it) and returns it
+                       // split into the content array produced by the parent indexer object ($this->pObj).
+                       // Returns FALSE for extensions that were not enabled by initParser() or are unknown,
+                       // and NULL when the tool for the extension is not set up in $this->app.
+               $contentArr = NULL;
+
+                       // Return immediately if initialization didn't set support up:
+               if (empty($this->supportedExtensions[$ext]))    return FALSE;
+
+                       // The file name goes into shell commands below. Quote it once so that spaces and
+                       // shell metacharacters in the path can neither break nor alter the command.
+               $quotedFile = escapeshellarg($absFile);
+
+                       // Switch by file extension
+               switch ($ext)   {
+                       case 'pdf':
+                               if (!empty($this->app['pdfinfo']))      {
+                                               // Getting pdf-info (exec() appends to an existing array, so start empty):
+                                       $pdfInfoLines = array();
+                                       exec($this->app['pdfinfo'].' '.$quotedFile,$pdfInfoLines);
+                                       $pdfInfo = $this->splitPdfInfo($pdfInfoLines);
+                                       if (isset($pdfInfo['pages']) && intval($pdfInfo['pages']))      {
+                                                       // $cPKey is a "low-high" page range; force both ends to integers
+                                                       // before they become part of the command line.
+                                               $range = explode('-',$cPKey,2);
+                                               $low = intval($range[0]);
+                                               $high = isset($range[1]) ? intval($range[1]) : 0;
+
+                                                       // Get pdf content:
+                                               $tempFileName = t3lib_div::tempnam('Typo3_indexer');            // Create temporary name
+                                               @unlink ($tempFileName);        // Delete if exists, just to be safe.
+                                               $cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$quotedFile.' '.escapeshellarg($tempFileName);
+                                               $cmdOutput = array();
+                                               exec($cmd,$cmdOutput);
+
+                                                       // The text is read from the temp file; stay with an empty string if the tool wrote none.
+                                               $content = '';
+                                               if (@is_file($tempFileName))    {
+                                                       $content = t3lib_div::getUrl($tempFileName);
+                                                       unlink($tempFileName);
+                                               } else {
+                                                       $this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
+                                               }
+                                               $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
+                                       }
+                               }
+                       break;
+                       case 'doc':
+                               if (!empty($this->app['catdoc']))       {
+                                       $res = array();
+                                       exec($this->app['catdoc'].' -d utf-8 '.$quotedFile,$res);
+                                       $content = implode(chr(10),$res);
+                                       $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
+                               }
+                       break;
+                       case 'pps':
+                       case 'ppt':
+                               if (!empty($this->app['ppthtml']))      {
+                                       $res = array();
+                                       exec($this->app['ppthtml'].' '.$quotedFile,$res);
+                                       $content = implode(chr(10),$res);
+                                       $content = $this->pObj->convertHTMLToUtf8($content);
+                                       $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
+                                       $contentArr['title'] = basename($absFile);      // Make sure the title doesn't expose the absolute path!
+                               }
+                       break;
+                       case 'xls':
+                               if (!empty($this->app['xlhtml']))       {
+                                       $res = array();
+                                       exec($this->app['xlhtml'].' -nc -te '.$quotedFile,$res);
+                                       $content = implode(chr(10),$res);
+                                       $content = $this->pObj->convertHTMLToUtf8($content);
+                                       $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
+                                       $contentArr['title'] = basename($absFile);      // Make sure the title doesn't expose the absolute path!
+                               }
+                       break;
+                       case 'sxi':
+                       case 'sxc':
+                       case 'sxw':
+                               if (!empty($this->app['nativeOOMethod']))       {
+                                       if (t3lib_extMgm::isLoaded('libunzipped'))      {
+
+                                               global $TYPO3_CONF_VARS;
+                                               require_once(t3lib_extMgm::extPath('libunzipped').'class.tx_libunzipped.php');
+
+                                                       // Initialize Unzip object:
+                                               $unzip = t3lib_div::makeInstance('tx_libunzipped');
+                                               $ooFiles = $unzip->init($absFile);
+                                               if (is_array($ooFiles)) {
+                                                               // Read content.xml and meta.xml from the archive:
+                                                       $content_xml = $unzip->getFileFromArchive('content.xml');
+                                                       $meta_xml = $unzip->getFileFromArchive('meta.xml');
+                                                               // A space is put in front of every tag so words don't run together when tags are stripped.
+                                                       $utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml['content'])));
+                                                       $contentArr = $this->pObj->splitRegularContent($utf8_content);
+                                                       $contentArr['title'] = basename($absFile);      // Make sure the title doesn't expose the absolute path!
+
+                                                               // Meta information
+                                                       $metaContent = t3lib_div::xml2tree($meta_xml['content']);
+                                                       $metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
+                                                       if (is_array($metaContent))     {
+                                                               $contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
+                                                               $contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];
+
+                                                                       // Keywords collected:
+                                                               if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword']))   {
+                                                                       foreach($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat)       {
+                                                                               $contentArr['keywords'].= $kwDat['values'][0].' ';
+                                                                       }
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               } else {
+                                       if (!empty($this->app['ruby'])) {
+                                                       // Extracting document headers:
+                                               $headings = array();
+                                               exec($this->app['ruby'].' '.$this->app['OOo'].' --heading '.$quotedFile,$headings);
+
+                                                       // Extracting document text:
+                                               $texts = array();
+                                               exec($this->app['ruby'].' '.$this->app['OOo'].' '.$quotedFile,$texts);
+
+                                               $content = implode(chr(10),$headings).' '.implode(chr(10),$texts);
+                                               $contentArr = $this->pObj->splitRegularContent($content);
+                                               $contentArr['title'] = basename($absFile);      // Make sure the title doesn't expose the absolute path!
+                                       }
+                               }
+                       break;
+                       case 'rtf':
+                               if (!empty($this->app['unrtf']))        {
+                                       $res = array();
+                                       exec($this->app['unrtf'].' '.$quotedFile,$res);
+                                       $fileContent = implode(chr(10),$res);
+                                       $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
+                                       $contentArr = $this->pObj->splitHTMLContent($fileContent);
+                               }
+                       break;
+                       case 'txt':
+                       case 'csv':             // Raw text
+                               $content = t3lib_div::getUrl($absFile);
+                                       // TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
+                               $content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
+                               $contentArr = $this->pObj->splitRegularContent($content);
+                               $contentArr['title'] = basename($absFile);      // Make sure the title doesn't expose the absolute path!
+                       break;
+                       case 'html':
+                       case 'htm':
+                               $fileContent = t3lib_div::getUrl($absFile);
+                               $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
+                               $contentArr = $this->pObj->splitHTMLContent($fileContent);
+                       break;
+                       case 'xml':             // PHP strip-tags()
+                               $fileContent = t3lib_div::getUrl($absFile);
+
+                                       // Finding charset in the XML declaration within the first 200 bytes; utf-8 if none is given.
+                                       // preg_match() with the "i" modifier replaces eregi(), which was removed in PHP 7.
+                               $reg = array();
+                               preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i',substr($fileContent,0,200),$reg);
+                               $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
+
+                                       // Converting content:
+                               $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
+                               $contentArr = $this->pObj->splitRegularContent($fileContent);
+                               $contentArr['title'] = basename($absFile);      // Make sure the title doesn't expose the absolute path!
+                       break;
+                       case 'jpg':             // PHP EXIF
+                       case 'jpeg':    // PHP EXIF
+                       case 'tif':             // PHP EXIF
+                                       // exif_read_data() only exists when the EXIF extension is loaded; without it the
+                                       // image is still indexed by file name instead of ending in a fatal error.
+                               if (function_exists('exif_read_data'))  {
+                                       $exif = exif_read_data($absFile, 'IFD0');
+                               } else {
+                                       $exif = FALSE;
+                                       $this->pObj->log_setTSlogMessage('PHP EXIF functions are not available; no image comment was read from: '.$absFile,2);
+                               }
+                               if ($exif)      {
+                                       $comment = trim($exif['COMMENT'][0].' '.$exif['ImageDescription']);     // The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
+                               } else {
+                                       $comment = '';
+                               }
+                               $contentArr = $this->pObj->splitRegularContent($comment);
+                               $contentArr['title'] = basename($absFile);      // Make sure the title doesn't expose the absolute path!
+                       break;
+                       default:
+                               return false;
+               }
+                       // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
+               if (is_array($contentArr) && !$contentArr['title'])     {
+                       $contentArr['title'] = str_replace('_',' ',basename($absFile)); // Substituting "_" for " " because many filenames may have this instead of a space char.
+               }
+
+               return $contentArr;
+       }
+
+       /**
+        * Creates an array with pointers to divisions of document.
+        * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
+        *
+        * @param       string          File extension
+        * @param       string          Absolute filename (must exist and be validated OK before calling function)
+        * @return      array           Array of pointers to sections that the document should be divided into
+        */
+       function fileContentParts($ext,$absFile)        {
+                       // Default for every file type: one single section with the pointer "0".
+               $cParts = array(0);
+               switch ($ext)   {
+                       case 'pdf':
+                                       // Without a configured pdfinfo tool the command line would consist of the
+                                       // file name alone, so the default section list is returned instead.
+                               if (empty($this->app['pdfinfo']))       break;
+
+                                       // Getting pdf-info. The file name is quoted so that spaces and shell
+                                       // metacharacters in it can neither break nor alter the command:
+                               $pdfInfoLines = array();
+                               exec($this->app['pdfinfo'].' '.escapeshellarg($absFile),$pdfInfoLines);
+                               $pdfInfo = $this->splitPdfInfo($pdfInfoLines);
+                               $pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
+
+                               if ($pages)     {
+                                       $cParts = array();
+
+                                               // Calculate the number of sections: a positive pdf_mode is the number of pages per
+                                               // section; otherwise abs(pdf_mode) is the number of sections, passed through
+                                               // t3lib_div::intInRange() with the bounds 1 and the page count.
+                                       if ($this->pdf_mode>0)  {
+                                               $iter = ceil($pages/$this->pdf_mode);
+                                       } else {
+                                               $iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
+                                       }
+
+                                               // Traverse and create the "low-high" page intervals.
+                                       for ($a=0;$a<$iter;$a++)        {
+                                               $low = floor($a*($pages/$iter))+1;
+                                               $high = floor(($a+1)*($pages/$iter));
+                                               $cParts[] = $low.'-'.$high;
+                                       }
+                               }
+                       break;
+               }
+               return $cParts;
+       }
+
+       /**
+        * Analysing PDF info into a useable format.
+        *
+        * @param       array           Array of PDF content, coming from the pdfinfo tool
+        * @return      array           Result array
+        * @access private
+        * @see fileContentParts()
+        */
+       function splitPdfInfo($pdfInfoArray)    {
+                       // Turns "Label: value" lines into an array keyed by the lowercased, trimmed label.
+               $info = array();
+               if (!is_array($pdfInfoArray))   {
+                       return $info;
+               }
+
+               foreach($pdfInfoArray as $infoLine)     {
+                               // Split on the first colon only, so values may contain colons themselves.
+                       $pair = explode(':',$infoLine,2);
+                       if (count($pair)<2)     continue;
+
+                               // Lines with an empty label are skipped.
+                       $label = trim($pair[0]);
+                       if (!$label)    continue;
+
+                       $info[strtolower($label)] = trim($pair[1]);
+               }
+               return $info;
+       }
+
+       /**
+        * Removes form feed characters (chr(12)) and line breaks that tend to occur at the end of strings read from external files.
+        *
+        * @param       string          String to clean up
+        * @return      string          String
+        */
+       function removeEndJunk($string) {
+                       // First drop the trailing run of line feeds (chr(10)) and form feeds (chr(12)), then trim
+                       // ordinary whitespace from both ends. The explicit character list is needed because
+                       // trim() alone does not remove form feeds. rtrim() replaces the POSIX ereg_replace()
+                       // call, which was removed in PHP 7; the cast covers callers passing NULL.
+               return trim(rtrim((string)$string, chr(10).chr(12)));
+       }
+
+
+
+
+
+
+
+
+
+
+
+
+       /************************
+        *
+        * Backend analyzer
+        *
+        ************************/
+
+       /**
+        * Return icon for file extension
+        *
+        * @param       string          File extension, lowercase.
+        * @return      string          Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
+        */
+       function getIcon($extension)    {
+                       // "htm" and "jpeg" share the icon of "html" and "jpg"; every other extension maps to an icon of its own name.
+               switch($extension)      {
+                       case 'htm':
+                               $iconName = 'html';
+                       break;
+                       case 'jpeg':
+                               $iconName = 'jpg';
+                       break;
+                       default:
+                               $iconName = $extension;
+                       break;
+               }
+
+               return 'EXT:indexed_search/pi/res/'.$iconName.'.gif';
+       }
+}
+
+// XCLASS hook: when the TYPO3_MODE constant is defined and a file is registered for this
+// script under $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], that file is included (once).
+if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])    {
+    include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
+}
+?>
\ No newline at end of file
index a6fd757..24d0740 100755 (executable)
  * This class is a search indexer for TYPO3
  *
  * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
- * @coauthor   Christian Jul Jensen <christian@typo3.com>
+ * Originally Christian Jul Jensen <christian@jul.net> helped as well.
  */
 /**
  * [CLASS/FUNCTION INDEX of SCRIPT]
  *
  *
  *
- *  118: class tx_indexedsearch_indexer
- *  200:     function hook_indexContent(&$pObj)
+ *  135: class tx_indexedsearch_indexer
+ *  198:     function hook_indexContent(&$pObj)
+ *
+ *              SECTION: Backend API
+ *  283:     function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
+ *  320:     function backend_setFreeIndexUid($freeIndexUid)
+ *  337:     function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
  *
  *              SECTION: Initialization
- *  242:     function init()
- *  271:     function initExternalReaders()
+ *  388:     function init()
+ *  439:     function initializeExternalParsers()
  *
- *              SECTION: Indexing
- *  325:     function indexTypo3PageContent()
- *  400:     function splitHTMLContent($content)
- *  446:     function splitRegularContent($content)
- *  459:     function procesWordsInArrays($contentArr)
- *  482:     function bodyDescription($contentArr)
- *  499:     function extractLinks($content)
- *  531:     function getJumpurl($query)
- *  544:     function splitPdfInfo($pdfInfoArray)
- *  564:     function indexRegularDocument($file)
- *  647:     function readFileContent($ext,$absFile,$cPKey)
- *  711:     function fileContentParts($ext,$absFile)
- *  754:     function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
- *  780:     function indexAnalyze($content)
- *  801:     function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
- *  820:     function analyzeBody(&$retArr,$content)
- *  840:     function typoSearchTags(&$body)
+ *              SECTION: Indexing; TYPO3 pages (HTML content)
+ *  480:     function indexTypo3PageContent()
+ *  564:     function splitHTMLContent($content)
+ *  610:     function getHTMLcharset($content)
+ *  625:     function convertHTMLToUtf8($content,$charset='')
+ *  653:     function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
+ *  680:     function typoSearchTags(&$body)
+ *  709:     function extractLinks($content)
+ *  752:     function extractHyperLinks($string)
  *
- *              SECTION: Words
- *  891:     function split2words(&$string)
- *  924:     function wordOK($w)
- *  942:     function metaphone($word)
- *  954:     function strtolower_all($str)
+ *              SECTION: Indexing; external URL
+ *  804:     function indexExternalUrl($externalUrl)
+ *  835:     function getUrlHeaders($url, $timeout = 2)
  *
- *              SECTION: SQL Helper functions
- *  985:     function freqMap($freq)
- * 1003:     function getRootLineFields(&$fieldArr)
+ *              SECTION: Indexing; external files (PDF, DOC, etc)
+ *  895:     function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
+ * 1001:     function readFileContent($ext,$absFile,$cPKey)
+ * 1018:     function fileContentParts($ext,$absFile)
+ * 1036:     function splitRegularContent($content)
+ *
+ *              SECTION: Analysing content, Extracting words
+ * 1069:     function charsetEntity2utf8(&$contentArr, $charset)
+ * 1091:     function procesWordsInArrays($contentArr)
+ * 1114:     function bodyDescription($contentArr)
+ * 1135:     function indexAnalyze($content)
+ * 1156:     function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
+ * 1175:     function analyzeBody(&$retArr,$content)
+ * 1195:     function metaphone($word,$retRaw=FALSE)
+ *
+ *              SECTION: SQL; TYPO3 Pages
+ * 1237:     function submitPage()
+ * 1306:     function submit_grlist($hash,$phash_x)
+ * 1326:     function submit_section($hash,$hash_t3)
+ * 1344:     function removeOldIndexedPages($phash)
+ *
+ *              SECTION: SQL; External media
+ * 1387:     function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
+ * 1445:     function submitFile_grlist($hash)
+ * 1459:     function submitFile_section($hash)
+ * 1473:     function removeOldIndexedFiles($phash)
  *
  *              SECTION: SQL Helper functions
- * 1043:     function removeIndexedPhashRow($phashList,$clearPageCache=1)
- * 1083:     function checkMtimeTstamp($mtime,$maxAge,$minAge,$phash)
- * 1117:     function update_grlist($phash,$phash_x)
- * 1129:     function is_grlist_set($phash_x)
- * 1140:     function checkContentHash()
- * 1154:     function removeLoginpagesWithContentHash()
- * 1172:     function removeOldIndexedPages($phash)
- * 1190:     function checkExternalDocContentHash($hashGr,$content_md5h)
- * 1205:     function updateTstamp($phash,$mtime=0)
- * 1221:     function updateParsetime($phash,$parsetime)
- * 1234:     function updateRootline()
+ * 1509:     function checkMtimeTstamp($mtime,$phash)
+ * 1545:     function checkContentHash()
+ * 1562:     function checkExternalDocContentHash($hashGr,$content_md5h)
+ * 1576:     function is_grlist_set($phash_x)
+ * 1589:     function update_grlist($phash,$phash_x)
+ * 1604:     function updateTstamp($phash,$mtime=0)
+ * 1620:     function updateParsetime($phash,$parsetime)
+ * 1633:     function updateRootline()
+ * 1648:     function getRootLineFields(&$fieldArr)
+ * 1667:     function removeLoginpagesWithContentHash()
  *
- *              SECTION: SQL; Inserting in database
- * 1264:     function submitPage()
- * 1317:     function submit_grlist($hash,$phash_x)
- * 1335:     function submit_section($hash,$hash_t3)
- * 1361:     function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
- * 1402:     function submitFile_grlist($hash)
- * 1419:     function submitFile_section($hash)
- * 1436:     function checkWordList($wl)
- * 1473:     function submitWords($wl,$phash)
+ *              SECTION: SQL; Submitting words
+ * 1702:     function checkWordList($wl)
+ * 1739:     function submitWords($wl,$phash)
+ * 1763:     function freqMap($freq)
  *
  *              SECTION: Hashing
- * 1517:     function setT3Hashes()
- * 1540:     function setExtHashes($file,$subinfo=array())
- * 1563:     function md5inthash($str)
+ * 1796:     function setT3Hashes()
+ * 1822:     function setExtHashes($file,$subinfo=array())
+ * 1846:     function md5inthash($str)
+ * 1856:     function makeCHash($paramArray)
+ *
+ *              SECTION: Internal logging functions
+ * 1898:     function log_push($msg,$key)
+ * 1907:     function log_pull()
+ * 1918:     function log_setTSlogMessage($msg, $errorNum=0)
  *
- * TOTAL FUNCTIONS: 47
+ * TOTAL FUNCTIONS: 55
  * (This index is automatically created/updated by the extension "extdeveval")
  *
  */
 
 
-
-require_once(PATH_t3lib.'class.t3lib_htmlmail.php');
+require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
 
 
 /**
@@ -126,39 +143,20 @@ class tx_indexedsearch_indexer {
                3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
                4 => 'Page has never been indexed (is not represented in the index_phash table).'
        );
-       var $convChars=array(
-               'ÁÉÚÍÄËÜÖÏÆØÅ',
-               'áéúíâêûôîæøå'
-       );
 
                // HTML code blocks to exclude from indexing:
        var $excludeSections = 'script,style';
 
                // Supported Extensions for external files:
-       var $supportedExtensions = array(
-                       'pdf' => 1,
-                       'doc' => 1,
-                       'txt' => 1,
-                       'html' => 1,
-                       'htm' => 1
-               );
-
-               // This value is also overridden from config.
-       var $pdf_mode = -20;    // zero: whole PDF file is indexed in one. positive value: Indicates number of pages at a time, eg. "5" would means 1-5,6-10,.... Negative integer would indicate (abs value) number of groups. Eg "3" groups of 10 pages would be 1-4,5-8,9-10
-
-               // This array is reset and configured in initialization:
-       var $app = array(
-               'pdftotext' => '/usr/local/bin/pdftotext',
-               'pdfinfo' => '/usr/local/bin/pdfinfo',
-               'catdoc' => '/usr/local/bin/catdoc'
-       );
+       var $external_parsers = array();                // External parser objects, keys are file extension names. Values are objects with certain methods.
 
                // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
-       var $defaultGrList='0,-1';
+       var $defaultGrList = '0,-1';
 
                // Min/Max times:
        var $tstamp_maxAge = 0;         // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
        var $tstamp_minAge = 0;         // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
+       var $maxExternalFiles = 0;      // Max number of external files to index.
 
                // INTERNALS:
        var $defaultContentArray=array(
@@ -168,54 +166,200 @@ class tx_indexedsearch_indexer {
                'body' => '',
        );
        var $wordcount = 0;
-       var $Itypes = array(
-               'html' => 1,
-               'htm' => 1,
-               'pdf' => 2,
-               'doc' => 3,
-               'txt' => 4
-       );
-       var $conf = array();    // Configuration set internally
-       var $hash = array();    // Hash array, contains phash and phash_grouping
-       var $contentParts = array();
-       var $pObj = '';                         // Parent object, reference to global TSFE
+       var $externalFileCounter = 0;
+
+       var $conf = array();            // Configuration set internally (see init functions for required keys and their meaning)
+       var $indexerConfig = array();   // Indexer configuration
+       var $hash = array();            // Hash array, contains phash and phash_grouping
+       var $file_phash_arr = array();  // Hash array for files
+       var $contentParts = array();    // Content of TYPO3 page
        var $content_md5h = '';
+       var $internal_log = array();    // Internal log
+       var $indexExternalUrl_content = '';
 
        var $cHashParams = array();     // cHashparams array
-       var $mtime = 0;                         // If set, then the mtime of the document must be different in order to be indexed.
-       var $rootLine = array();        // Root line from TSFE
 
        var $freqRange = 65000;
        var $freqMax = 0.1;
 
+               // Objects:
+       var $csObj;                             // Charset class object , t3lib_cs
+       var $metaphoneObj;              // Metaphone object, if any
+       var $lexerObj;                  // Lexer object for word splitting
 
 
 
        /**
-        * Parent Object (TSFE)
+        * Parent Object (TSFE) Initialization
         *
         * @param       object          Parent Object (frontend TSFE object), passed by reference
         * @return      void
         */
        function hook_indexContent(&$pObj)      {
 
+                       // Indexer configuration from Extension Manager interface:
+               $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
+
+                       // Determine if page should be indexed, and if so, configure and initialize indexer
                if ($pObj->config['config']['index_enable'])    {
-                       if (!$pObj->no_cache)   {
-                               $GLOBALS['TT']->push('Index page','');
+                       $this->log_push('Index page','');
 
-                                               // Setting parent object:
-                                       $this->pObj = &$pObj;
+                       if (!$indexerConfig['disableFrontendIndexing']) {
+                               if (!$pObj->page['no_search'])  {
+                                       if (!$pObj->no_cache)   {
 
-                                               // Init and start indexing:
-                                       $this->init();
-                                       $this->indexTypo3PageContent();
-                               $GLOBALS['TT']->pull();
-                       } else {
-                               $GLOBALS['TT']->push('Index page','');
-                               $GLOBALS['TT']->setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
-                               $GLOBALS['TT']->pull();
-                       }
+                                                       // Setting up internal configuration from config array:
+                                               $this->conf = array();
+
+                                                       // Information about page for which the indexing takes place
+                                               $this->conf['id'] = $pObj->id;                          // Page id
+                                               $this->conf['type'] = $pObj->type;                      // Page type
+                                               $this->conf['sys_language_uid'] = $pObj->sys_language_uid;      // sys_language UID of the language of the indexing.
+                                               $this->conf['MP'] = $pObj->MP;                          // MP variable, if any (Mount Points)
+                                               $this->conf['gr_list'] = $pObj->gr_list;        // Group list
+
+                                               $this->conf['cHash'] = $pObj->cHash;                                    // cHash string for additional parameters
+                                               $this->conf['cHash_array'] = $pObj->cHash_array;                // Array of the additional parameters
+
+                                               $this->conf['crdate'] = $pObj->page['crdate'];                  // The creation date of the TYPO3 page
+                                               $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;        // reg1 of the caching table. Not known what practical use this has.
+
+                                                       // Root line uids
+                                               $this->conf['rootline_uids'] = array();
+                                               foreach($pObj->config['rootLine'] as $rlkey => $rldat)  {
+                                                       $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
+                                               }
+
+                                                       // Content of page:
+                                               $this->conf['content'] = $pObj->content;                                        // Content string (HTML of TYPO3 page)
+                                               $this->conf['indexedDocTitle'] = $pObj->indexedDocTitle;        // Alternative title for indexing
+                                               $this->conf['metaCharset'] = $pObj->metaCharset;                        // Character set of content (will be converted to utf-8 during indexing)
+                                               $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];      // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
+
+                                                       // Configuration of behavior:
+                                               $this->conf['index_externals'] = $pObj->config['config']['index_externals'];    // Whether to index external documents like PDF, DOC etc. (if possible)
+                                               $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];              // Length of description text (max 250, default 200)
+
+                                                       // Set to zero:
+                                               $this->conf['recordUid'] = 0;
+                                               $this->conf['freeIndexUid'] = 0;
+
+                                                       // Init and start indexing:
+                                               $this->init();
+                                               $this->indexTypo3PageContent();
+
+                                       } else $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
+                               } else $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page header!');
+                       } else $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
                }
+               $this->log_pull();
+       }
+
+
+
+
+
+
+
+
+       /****************************
+        *
+        * Backend API
+        *
+        ****************************/
+
+       /**
+        * Prepares the indexer for a page addressed from the backend: $this->conf is rebuilt from the arguments plus fixed defaults and init() is called afterwards.
+        * init() is what sets the "combined ID" of the page (phash) being indexed (or for which external media is attached).
+        *
+        * @param       integer         The page uid, &id=
+        * @param       integer         The page type, &type=
+        * @param       integer         sys_language uid, typically &L=
+        * @param       string          The MP variable (Mount Points), &MP=
+        * @param       array           Rootline array of only UIDs.
+        * @param       array           Array of GET variables to register with this indexing
+        * @param       boolean         If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
+        * @return      void
+        */
+       function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)      {
+
+                       // Start from scratch with the information about the page for which the indexing takes place:
+               $this->conf = array(
+                       'id' => $id,                                            // Page id (integer)
+                       'type' => $type,                                        // Page type (integer)
+                       'sys_language_uid' => $sys_language_uid,        // sys_language UID of the language of the indexing (integer)
+                       'MP' => $MP,                                            // MP variable, if any (Mount Points) (string)
+                       'gr_list' => '0,-1'                                     // Group list (hardcoded for now...)
+               );
+
+                       // cHash string for the additional parameters, only calculated on request:
+               if ($createCHash)       {
+                       $this->conf['cHash'] = $this->makeCHash($cHash_array);
+               } else {
+                       $this->conf['cHash'] = '';
+               }
+
+                       // Remaining values: the additional parameters, defaults, root line uids and configuration of behavior:
+               $this->conf = array_merge($this->conf, array(
+                       'cHash_array' => $cHash_array,          // Array of the additional parameters
+                       'freeIndexUid' => 0,
+                       'page_cache_reg1' => '',
+                       'rootline_uids' => $uidRL,
+                       'index_externals' => 1,                         // Whether to index external documents like PDF, DOC etc. (if possible)
+                       'index_descrLgd' => 200                         // Length of description text (max 250, default 200)
+               ));
+
+                       // Initialize the indexer with this configuration (indexing itself is not started here):
+               $this->init();
+       }
+
+       /**
+        * Sets the free-index uid. Can be called right after backend_initIndexer()
+        * The value is stored in $this->conf['freeIndexUid'], which backend_initIndexer() resets to 0.
+        *
+        * @param       integer         Free index UID
+        * @return      void
+        */
+       function backend_setFreeIndexUid($freeIndexUid) {
+               $this->conf['freeIndexUid'] = $freeIndexUid;
+       }
+
+       /**
+        * Indexing records as the content of a TYPO3 page.
+        * Title, keywords, description and content are each passed through htmlspecialchars() and wrapped in a minimal HTML document, which is stored in $this->conf['content'] and handed to indexTypo3PageContent().
+        * NOTE(review): only a few $this->conf keys are set here, so this presumably requires backend_initIndexer() to have been called first - confirm against the callers.
+        *
+        * @param       string          Title equivalent
+        * @param       string          Keywords equivalent
+        * @param       string          Description equivalent
+        * @param       string          The main content to index
+        * @param       string          The charset of the title, keyword, description and body-content
+        * @param       integer         Last modification time, in seconds
+        * @param       integer         The creation date of the content, in seconds
+        * @param       integer         The record UID that the content comes from (for registration with the indexed rows)
+        * @return      void
+        */
+       function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {
+
+                       // Content of page:
+               $this->conf['mtime'] = $mtime;                  // Most recent modification time (seconds) of the content
+               $this->conf['crdate'] = $crdate;                // The creation date of the TYPO3 content
+               $this->conf['recordUid'] = $recordUid;  // UID of the record, if applicable
+
+                       // Construct fake HTML for parsing:
+               $this->conf['content'] = '
+               <html>
+                       <head>
+                               <title>'.htmlspecialchars($title).'</title>
+                               <meta name="keywords" content="'.htmlspecialchars($keywords).'" />
+                               <meta name="description" content="'.htmlspecialchars($description).'" />
+                       </head>
+                       <body>
+                               '.htmlspecialchars($content).'
+                       </body>
+               </html>';                                       // Content string (fake HTML document standing in for a TYPO3 page)
+
+                       // Initializing charset:
+               $this->conf['metaCharset'] = $charset;                  // Character set of content (will be converted to utf-8 during indexing)
+               $this->conf['indexedDocTitle'] = '';    // Alternative title for indexing (none; the title is part of the fake HTML above)
+
+                       // Index content as if it was a TYPO3 page:
+               $this->indexTypo3PageContent();
        }
 
 
@@ -228,6 +372,8 @@ class tx_indexedsearch_indexer {
 
 
 
+
+
        /********************************
         *
         * Initialization
@@ -235,70 +381,75 @@ class tx_indexedsearch_indexer {
         *******************************/
 
        /**
-        * Initializes the object
+        * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
+        * Steps: copies the cHash parameters from $this->conf, calls setT3Hashes(), reads the extension configuration (minAge, maxAge, maxExternalFiles, flagBitMask), initializes the external parsers if $this->conf['index_externals'] is set, and creates the lexer, the optional metaphone object and the charset object.
         *
         * @return      void
         */
        function init() {
+               global $TYPO3_CONF_VARS;
 
                        // Initializing:
-               $this->cHashParams = $this->pObj->cHash_array;
+               $this->cHashParams = $this->conf['cHash_array'];
                if (is_array($this->cHashParams) && count($this->cHashParams))  {
-                       $this->cHashParams['cHash'] = $this->pObj->cHash;       // Add this so that URL's come out right...
+                       if ($this->conf['cHash'])       $this->cHashParams['cHash'] = $this->conf['cHash'];     // Add this so that URL's come out right...
+                       unset($this->cHashParams['encryptionKey']);             // encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
                }
 
-                       // Modification time of page and root line transferred:
-               $this->mtime = $this->pObj->register['SYS_LASTCHANGED'];
-               $this->rootLine = $this->pObj->config['rootLine'];
-
-                       // Setting up internal configuration from config array:
-               $this->conf = array();
-               $this->conf['index_externals'] = $this->pObj->config['config']['index_externals'];
-               $this->conf['index_descrLgd'] = $this->pObj->config['config']['index_descrLgd'];
-
                        // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
                $this->setT3Hashes();
 
-                       // Initialize tools for reading PDF and Word documents:
-               $this->initExternalReaders();
+                       // Indexer configuration from Extension Manager interface (minAge / maxAge are multiplied by 3600, presumably hours converted to seconds - confirm against ext_conf_template.txt):
+               $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
+               $this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
+               $this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
+               $this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
+               $this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);
+
+                       // Initialize external document parsers:
+                       // Example configuration, see ext_localconf.php of this extension!
+               if ($this->conf['index_externals'])     {
+                       $this->initializeExternalParsers();
+               }
+
+                       // Initialize lexer (class that deconstructs the text into words):
+                       // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
+               $lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?
+                                               $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] :
+                                               'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
+               $this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
+               $this->lexerObj->debug = $this->indexerConfig['debugMode'];
+
+                       // Initialize metaphone hook (the object is only created when the hook is configured):
+                       // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
+               if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
+                       $this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
+               }
+
+                       // Init charset class:
+               $this->csObj = &t3lib_div::makeInstance('t3lib_cs');
        }
 
        /**
-        * Initializes external readers, if any
+        * Initialize external parsers
+        * For every entry in $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (file extension => object reference) the object is fetched with t3lib_div::getUserObj(), gets this indexer assigned as ->pObj and has ->initParser($extension) called. Entries for which initParser() returns false are removed again.
         *
         * @return      void
+        * @access private
+        * @see init()
         */
-       function initExternalReaders()  {
-                       // PDF + WORD tools:
-                       // First reset the class default settings (disabling)
-               $this->app = array();
-               $this->supportedExtensions['pdf'] = 0;
-               $this->supportedExtensions['doc'] = 0;
-
-                       // Then read indexer-config and set if appropriate:
-               $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
+       function initializeExternalParsers()    {
+               global $TYPO3_CONF_VARS;
 
-                       // PDF
-               if ($indexerConfig['pdftools']) {
-                       $pdfPath = ereg_replace("\/$",'',$indexerConfig['pdftools']).'/';
-                       if ((ini_get('safe_mode') && $pdfPath) || (@is_file($pdfPath.'pdftotext') && @is_file($pdfPath.'pdfinfo')))     {
-                               $this->app['pdfinfo'] = $pdfPath.'pdfinfo';
-                               $this->app['pdftotext'] = $pdfPath.'pdftotext';
-                               $this->supportedExtensions['pdf'] = 1;
-                       } else $GLOBALS['TT']->setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
-               } else $GLOBALS['TT']->setTSlogMessage('PDF tools disabled',1);
-
-                       // Catdoc
-               if ($indexerConfig['catdoc'])   {
-                       $catdocPath = ereg_replace("\/$",'',$indexerConfig['catdoc']).'/';
-                       if (is_file($catdocPath.'catdoc'))      {
-                               $this->app['catdoc'] = $catdocPath.'catdoc';
-                               $this->supportedExtensions['doc'] = 1;
-                       } else $GLOBALS['TT']->setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
-               } else $GLOBALS['TT']->setTSlogMessage('catdoc tools (Word-files) disabled',1);
-
-                       // PDF mode:
-               $this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
+               if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']))        {
+                       foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef)    {
+                               $this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);
+                               $this->external_parsers[$extension]->pObj = &$this;
+
+                                       // Init parser and if it returns false, unset its entry again:
+                               if (!$this->external_parsers[$extension]->initParser($extension))       {
+                                       unset($this->external_parsers[$extension]);
+                               }
+                       }
+               }
        }
 
 
@@ -311,9 +462,13 @@ class tx_indexedsearch_indexer {
 
 
 
+
+
+
+
        /********************************
         *
-        * Indexing
+        * Indexing; TYPO3 pages (HTML content)
         *
         *******************************/
 
@@ -324,93 +479,101 @@ class tx_indexedsearch_indexer {
         */
        function indexTypo3PageContent()        {
 
-               $check = $this->checkMtimeTstamp($this->mtime, $this->tstamp_maxAge, $this->tstamp_minAge, $this->hash['phash']);
-# WHAT IS THIS? Test that it works...          $is_grlist = $this->is_grlist_set($phash_x);    // Use $this->hash['phash']?
+               $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
+               $is_grlist = $this->is_grlist_set($this->hash['phash']);
 
                if ($check > 0 || !$is_grlist)  {
 
                                // Setting message:
                        if ($check > 0) {
-                               $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
+                               $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
                        } else {
-                               $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
+                               $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
                        }
 
                                        // Divide into title,keywords,description and body:
-                       $GLOBALS['TT']->push('Split content','');
-                               $this->contentParts = $this->splitHTMLContent($this->pObj->content);
-                               if ($this->pObj->indexedDocTitle)       $this->contentParts['title'] = $this->pObj->indexedDocTitle;
-                       $GLOBALS['TT']->pull();
+                       $this->log_push('Split content','');
+                               $this->contentParts = $this->splitHTMLContent($this->conf['content']);
+                               if ($this->conf['indexedDocTitle'])     {
+                                       $this->contentParts['title'] = $this->conf['indexedDocTitle'];
+                               }
+                       $this->log_pull();
 
                                // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
                        $this->content_md5h = $this->md5inthash(implode($this->contentParts,''));
+
                                // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
                                // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
                                // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
                        $checkCHash = $this->checkContentHash();
-                       if (!is_array($checkCHash))     {
+                       if (!is_array($checkCHash) || $check===1)       {
                                $Pstart=t3lib_div::milliseconds();
+
+                               $this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
+                                       $this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
+                               $this->log_pull();
+
                                                // Splitting words
-                               $GLOBALS['TT']->push('Extract words from content','');
+                               $this->log_push('Extract words from content','');
                                        $splitInWords = $this->procesWordsInArrays($this->contentParts);
-                               $GLOBALS['TT']->pull();
+                               $this->log_pull();
 
                                                // Analyse the indexed words.
-                               $GLOBALS['TT']->push('Analyse the extracted words','');
+                               $this->log_push('Analyse the extracted words','');
                                        $indexArr = $this->indexAnalyze($splitInWords);
-                               $GLOBALS['TT']->pull();
+                               $this->log_pull();
 
                                                // Submitting page (phash) record
-                               $GLOBALS['TT']->push('Submitting page','');
+                               $this->log_push('Submitting page','');
                                        $this->submitPage();
-                               $GLOBALS['TT']->pull();
+                               $this->log_pull();
 
                                                // Check words and submit to word list if not there
-                               $GLOBALS['TT']->push('Check word list and submit words','');
+                               $this->log_push('Check word list and submit words','');
                                        $this->checkWordList($indexArr);
                                        $this->submitWords($indexArr,$this->hash['phash']);
-                               $GLOBALS['TT']->pull();
+                               $this->log_pull();
 
                                                // Set parsetime
                                $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);
 
                                                // Checking external files if configured for.
-                               $GLOBALS['TT']->push('Checking external files','');
+                               $this->log_push('Checking external files','');
                                if ($this->conf['index_externals'])     {
-                                       $this->extractLinks($this->pObj->content);
+                                       $this->extractLinks($this->conf['content']);
                                }
-                               $GLOBALS['TT']->pull();
+                               $this->log_pull();
                        } else {
-                               $this->updateTstamp($this->hash['phash'],$this->mtime); // Update the timestatmp
-                               $this->update_grlist($checkCHash['phash'],$this->hash['phash']);
+                               $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestamp
+                               $this->update_grlist($checkCHash['phash'],$this->hash['phash']);        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
                                $this->updateRootline();
-                               $GLOBALS['TT']->setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
+                               $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
                        }
                } else {
-                       $GLOBALS['TT']->setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
+                       $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
                }
        }
 
        /**
         * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
         *
-        * @param       [type]          $content: ...
-        * @return      [type]          ...
+        * @param       string          HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
+        * @return      array           Array of content, having keys "title", "body", "keywords" and "description" set.
+        * @see splitRegularContent()
         */
        function splitHTMLContent($content) {
 
-               # divide head from body ( u-ouh :) )
-
-               $contentArr=$this->defaultContentArray;
+                       // divide head from body ( u-ouh :) )
+               $contentArr = $this->defaultContentArray;
                $contentArr['body'] = stristr($content,'<body');
                $headPart = substr($content,0,-strlen($contentArr['body']));
 
-               # get title
+                       // get title
                $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
                $titleParts = explode(':',$contentArr['title'],2);
                $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
 
-               # get keywords and description metatags
+                       // get keywords and description metatags
                for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }
                for($i=0;isset($meta[$i]);$i++) {
                        $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
@@ -418,452 +581,202 @@ class tx_indexedsearch_indexer {
                        if(stristr($meta[$i]['name'],'description')) $contentArr['description'].=','.$meta[$i]['content'];
                }
 
+                       // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
                $this->typoSearchTags($contentArr['body']);
 
-               # get rid of unwanted sections (ie. scripting and style stuff) in body
+                       // Get rid of unwanted sections (ie. scripting and style stuff) in body
                $tagList = explode(',',$this->excludeSections);
-               reset($tagList);
-               while(list(,$tag)=each($tagList)) {
+               foreach($tagList as $tag)       {
                        while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
                }
 
-               # remove tags, but first make sure we don't concatenate words by doing it
+                       // remove tags, but first make sure we don't concatenate words by doing it
                $contentArr['body'] = str_replace('<',' <',$contentArr['body']);
                $contentArr['body'] = trim(strip_tags($contentArr['body']));
 
                $contentArr['keywords'] = trim($contentArr['keywords']);
                $contentArr['description'] = trim($contentArr['description']);
-               # ta-dah!
-               return $contentArr;
-       }
-
-       /**
-        * Splits non-HTML content
-        *
-        * @param       [type]          $content: ...
-        * @return      [type]          ...
-        */
-       function splitRegularContent($content) {
-               $contentArr = $this->defaultContentArray;
-               $contentArr['body'] = $content;
-
-               return $contentArr;
-       }
-
-       /**
-        * Processing words in the array from split*Content -functions
-        *
-        * @param       [type]          $contentArr: ...
-        * @return      [type]          ...
-        */
-       function procesWordsInArrays($contentArr)       {
-
-               # split all parts to words
-               reset($contentArr);
-               while(list($key,)=each($contentArr)) {
-                       if (function_exists('html_entity_decode'))              $contentArr[$key] = html_entity_decode($contentArr[$key]);
-                       $contentArr[$key] = $this->strtolower_all($contentArr[$key]);
-                       $this->split2words($contentArr[$key]);
-               }
 
-               # for title, keywords, and description we don't want duplicates
-               $contentArr['title'] = array_unique($contentArr['title']);
-               $contentArr['keywords'] = array_unique($contentArr['keywords']);
-               $contentArr['description'] = array_unique($contentArr['description']);
+                       // Return array
                return $contentArr;
        }
 
        /**
-        * Returns bodyDescription
-        *
-        * @param       [type]          $contentArr: ...
-        * @return      [type]          ...
-        */
-       function bodyDescription($contentArr)   {
-               # Setting description
-               $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
-               if ($maxL)      {
-                       if (function_exists('html_entity_decode'))              $bodyDescription = html_entity_decode(trim($contentArr['body']));
-                       $bodyDescription = implode(' ',split('[[:space:],]+',substr($bodyDescription,0,$maxL*2)));      // Takes the double lenght first, because whitespace may be removed and thus shorten the string more yet.
-                       $bodyDescription=substr($bodyDescription,0,$maxL);
-               }
-               return $bodyDescription;
-       }
-
-       /**
-        * extract links and if indexable media is found, it is indexed
+        * Extract the charset value from HTML meta tag.
         *
-        * @param       [type]          $content: ...
-        * @return      [type]          ...
+        * @param       string          HTML content
+        * @return      string          The charset value if found.
         */
-       function extractLinks($content) {
-               $extract = t3lib_div::makeInstance('t3lib_htmlmail');
-               $extract->extractHtmlInit($content,'');
-               $extract->extractHyperLinks();
-#debug($extract->theParts['html']['hrefs']);
-               if (is_array($extract->theParts['html']['hrefs']))      {
-                       reset($extract->theParts['html']['hrefs']);
-                       while(list(,$linkInfo)=each($extract->theParts['html']['hrefs']))       {
-                               $linkInfo['ref'] = t3lib_div::htmlspecialchars_decode($linkInfo['ref']);
-#debug($linkInfo['ref'],1);
-                               if (strstr($linkInfo['ref'],'?') && strstr($linkInfo['ref'],'jumpurl='))        {
-                                       $qParts = parse_url($linkInfo['ref']);
-#debug($qParts);
-                                       $theJumpurlFile = $this->getJumpurl($qParts['query']);
-//                                     debug($theJumpurlFile);
-                                       if ($theJumpurlFile && @is_file($theJumpurlFile))       {
-       //                                      debug($theJumpurlFile);
-                                               $this->indexRegularDocument($theJumpurlFile);
-                                       }
-                               } elseif (@is_file($linkInfo['ref']))   {
-                                       $this->indexRegularDocument($linkInfo['ref']);
-                               }
+       function getHTMLcharset($content)       {
+               if (eregi('<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>',$content,$reg))       {
+                       if (eregi('charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)',$reg[0],$reg2))     {
+                               return $reg2[1];
                        }
                }
        }
 
        /**
-        * [Describe function...]
-        *
-        * @param       [type]          $query: ...
-        * @return      [type]          ...
-        */
-       function getJumpurl($query)     {
-               $res = parse_str($query);
-#              debug(array($res),'getJumpurl');
-
-               return $jumpurl;
-       }
-
-       /**
-        * Splitting PDF info
+        * Converts an HTML document to utf-8
         *
-        * @param       [type]          $pdfInfoArray: ...
-        * @return      [type]          ...
+        * @param       string          HTML content, any charset
+        * @param       string          Optional charset (otherwise extracted from HTML)
+        * @return      string          Converted HTML
         */
-       function splitPdfInfo($pdfInfoArray)    {
-               $res = array();
-               if (is_array($pdfInfoArray))    {
-                       reset($pdfInfoArray);
-                       while(list(,$line)=each($pdfInfoArray)) {
-                               $parts = explode(':',$line,2);
-                               if (count($parts)>1 && trim($parts[0])) {
-                                       $res[strtolower(trim($parts[0]))] = trim($parts[1]);
-                               }
-                       }
-               }
-               return $res;
-       }
+       function convertHTMLToUtf8($content,$charset='')        {
 
-       /**
-        * Indexing a regular document given as $file (relative to PATH_site, local file)
-        *
-        * @param       [type]          $file: ...
-        * @return      [type]          ...
-        */
-       function indexRegularDocument($file)    {
-                       // init
-               $fI=pathinfo($file);
-               $ext = strtolower($fI['extension']);
-               $absFile = PATH_site.$file;
-#debug($file);
-                       //
-               if (@is_file($absFile) && $this->supportedExtensions[$ext])     {
-                       $mtime = filemtime($absFile);
-                       $cParts = $this->fileContentParts($ext,$absFile);
-//                     debug($cParts);
-                       reset($cParts);
-                       while(list(,$cPKey)=each($cParts))      {
-                               $GLOBALS['TT']->push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
-                               $Pstart = t3lib_div::milliseconds();
-                               $subinfo=array('key'=>$cPKey);
-                               $phash_arr = $this->setExtHashes($file,$subinfo);
-//                             debug($phash_arr);
-
-                               $check = $this->checkMtimeTstamp($mtime, $this->tstamp_maxAge, $this->tstamp_minAge, $phash_arr['phash']);
-                               if ($check > 0) {
-                                       $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
-                                                       // Divide into title,keywords,description and body:
-                                       $GLOBALS['TT']->push('Split content','');
-                                               $contentParts = $this->readFileContent($ext,$absFile,$cPKey);
-#debug($contentParts);
-                                       $GLOBALS['TT']->pull();
-                                       if (is_array($contentParts))    {
-                                                       // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
-                                               $content_md5h = $this->md5inthash(implode($contentParts,''));
-
-                                               if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h))    {
-                                                                       // Splitting words
-                                                       $GLOBALS['TT']->push('Extract words from content','');
-                                                               $splitInWords = $this->procesWordsInArrays($contentParts);
-                                                       $GLOBALS['TT']->pull();
-
-                                                                       // Analyse the indexed words.
-                                                       $GLOBALS['TT']->push('Analyse the extracted words','');
-                                                               $indexArr = $this->indexAnalyze($splitInWords);
-                                                       $GLOBALS['TT']->pull();
-
-                                                                       // Submitting page (phash) record
-                                                       $GLOBALS['TT']->push('Submitting page','');
-                                                               $size=filesize($absFile);
-                                                               $ctime=filemtime($absFile);     // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
-                                                               $this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
-                                                       $GLOBALS['TT']->pull();
-
-                                                                       // Check words and submit to word list if not there
-                                                       $GLOBALS['TT']->push('Check word list and submit words','');
-                                                               $this->checkWordList($indexArr);
-                                                               $this->submitWords($indexArr,$phash_arr['phash']);
-                                                       $GLOBALS['TT']->pull();
-
-                                                               // Set parsetime
-                                                       $this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
-                                               } else {
-                                                       $this->updateTstamp($phash_arr['phash'],$mtime);        // Update the timestamp
-                                                       $GLOBALS['TT']->setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
-                                               }
-                                       } else {
-                                               $GLOBALS['TT']->setTSlogMessage('Could not index file! Unsupported extension.');
-                                       }
-                               } else {
-                                       $GLOBALS['TT']->setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
-                               }
-                                       // Checking and setting sections:
-       #                       $this->submitFile_grlist($phash_arr['phash']);  // Setting a gr_list record if there is none already (set for default fe_group)
-                               $this->submitFile_section($phash_arr['phash']);         // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
-                               $GLOBALS['TT']->pull();
-                       }
-               }
-       }
+                       // Find charset:
+               $charset = $charset ? $charset : $this->getHTMLcharset($content);
+               $charset = $this->csObj->parse_charset($charset);
 
-       /**
-        * [Describe function...]
-        *
-        * @param       [type]          $ext: ...
-        * @param       [type]          $absFile: ...
-        * @param       [type]          $cPKey: ...
-        * @return      [type]          ...
-        */
-       function readFileContent($ext,$absFile,$cPKey)  {
-               switch ($ext)   {
-                       case 'pdf':
-                               if ($this->app['pdfinfo'])      {
-#debug($this->app);
-                                               // Getting pdf-info:
-                                       $cmd = $this->app['pdfinfo'].' '.$absFile;
-                                       exec($cmd,$res);
-                                       $pdfInfo=$this->splitPdfInfo($res);
-
-                                       if (intval($pdfInfo['pages']))  {
-                                               list($low,$high) = explode('-',$cPKey);
-
-                                                       // Get pdf content:
-                                               $tempFileName = t3lib_div::tempnam('Typo3_indexer');            // Create temporary name
-                                               @unlink ($tempFileName);        // Delete if exists, just to be safe.
-                                               $cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -q '.$absFile.' '.$tempFileName;
-       //                                      debug($cmd,1);
-                                               exec($cmd,$res);
-                                               if (@is_file($tempFileName))    {
-                                                       $content = t3lib_div::getUrl($tempFileName);
-                                                       unlink($tempFileName);
-                                               } else {
-                                                       $GLOBALS['TT']->setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
-                                               }
-                                               $contentArr = $this->splitRegularContent($content);
-                                       }
-                               }
-                       break;
-                       case 'doc':
-                               if ($this->app['catdoc'])       {
-                                       $cmd = $this->app['catdoc'].' '.$absFile;
-                                       exec($cmd,$res);
-                                       $content = implode(chr(10),$res);
-                                       $contentArr = $this->splitRegularContent($content);
-                               }
-                       break;
-                       case 'txt':
-                               $content = t3lib_div::getUrl($absFile);
-                               $contentArr = $this->splitRegularContent($content);
-                       break;
-                       case 'html':
-                       case 'htm':
-                               $fileContent = t3lib_div::getUrl($absFile);
-                               $contentArr = $this->splitHTMLContent($fileContent);
-                       break;
-                       default:
-                               return false;
-                       break;
-               }
-                       // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
-               if (!$contentArr['title'])      {
-                       $contentArr['title']=str_replace('_',' ',basename($absFile));   // Substituting "_" for " " because many filenames may have this instead of a space char.
+                       // Convert charset:
+               if ($charset && $charset!=='utf-8')     {
+                       $content = $this->csObj->utf8_encode($content, $charset);
                }
-               return $contentArr;
-       }
+                       // Convert entities, assuming document is now UTF-8:
+               $content = $this->csObj->entities_to_utf8($content, TRUE);
 
-       /**
-        * [Describe function...]
-        *
-        * @param       [type]          $ext: ...
-        * @param       [type]          $absFile: ...
-        * @return      [type]          ...
-        */
-       function fileContentParts($ext,$absFile)        {
-               $cParts=array(0);
-               switch ($ext)   {
-                       case 'pdf':
-                                       // Getting pdf-info:
-                               $cmd = $this->app['pdfinfo'].' '.$absFile;
-                               exec($cmd,$res);
-                               $pdfInfo=$this->splitPdfInfo($res);
-                       //      debug($pdfInfo);
-
-                               if (intval($pdfInfo['pages']))  {
-                                       $cParts=array();
-                                               // Calculate mode
-                                               // Calculate mode
-                                       if ($this->pdf_mode>0)  {
-                                               $iter=ceil($pdfInfo['pages']/$this->pdf_mode);
-                                       } else {
-                                               $iter=t3lib_div::intInRange(abs($this->pdf_mode),1,$pdfInfo['pages']);
-                                       }
-                                       for ($a=0;$a<$iter;$a++)        {
-                                               $low=floor($a*($pdfInfo['pages']/$iter))+1;
-                                               $high=floor(($a+1)*($pdfInfo['pages']/$iter));
-                                               $cParts[]=$low.'-'.$high;
-                                       }
-                               }
-                       break;
-               }
-               return $cParts;
+               return $content;
        }
 
-
        /**
         * Finds first occurence of embracing tags and returns the embraced content and the original string with
         * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
         * <title> of document or removing <script>-sections
         *
-        * @param       [type]          $string: ...
-        * @param       [type]          $tagName: ...
-        * @param       [type]          $tagContent: ...
-        * @param       [type]          $stringAfter: ...
-        * @param       [type]          $paramList: ...
-        * @return      [type]          ...
+        * @param       string          String to search in
+        * @param       string          Tag name, eg. "script"
+        * @param       string          Passed by reference: Content inside found tag
+        * @param       string          Passed by reference: Content after found tag
+        * @param       string          Passed by reference: Attributes of the found tag.
+        * @return      boolean         Returns false if tag was not found, otherwise true.
         */
        function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
                $endTag = '</'.$tagName.'>';
                $startTag = '<'.$tagName;
+
                $isTagInText = stristr($string,$startTag);              // stristr used because we want a case-insensitive search for the tag.
                if(!$isTagInText) return false; // if the tag was not found, return false
 
                list($paramList,$isTagInText) = explode('>',substr($isTagInText,strlen($startTag)),2);
                $afterTagInText = stristr($isTagInText,$endTag);
                if ($afterTagInText)    {
-                       $tagContent = substr($isTagInText,0,-strlen($afterTagInText));
-                       $stringAfter = substr($afterTagInText,strlen($endTag));
+                       $stringBefore = substr($string, 0, strpos(strtolower($string), strtolower($startTag)));
+                       $tagContent = substr($isTagInText,0,strlen($isTagInText)-strlen($afterTagInText));
+                       $stringAfter = $stringBefore.substr($afterTagInText,strlen($endTag));
                } else {        // If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
                        $tagContent='';
                        $stringAfter = $isTagInText;
                }
-//             debug(array($tagContent,$stringAfter));
-               return true;
-       }
-
-       /**
-        * Analyzes content to use for indexing,
-        * the parameter must be an array with the keys title,keywords,description and body, which all contain an array of words.
-        *
-        * @param       [type]          $content: ...
-        * @return      [type]          ...
-        */
-       function indexAnalyze($content) {
-               $indexArr = Array();
-               $counter = 0;
-
-               $this->analyzeHeaderinfo($indexArr,$content,'title',7);
-               $this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
-               $this->analyzeHeaderinfo($indexArr,$content,'description',5);
-               $this->analyzeBody($indexArr,$content);
-
-               return ($indexArr);
-       }
-
-       /**
-        * Calculates relevant information for headercontent
-        *
-        * @param       [type]          $$retArr: ...
-        * @param       [type]          $content: ...
-        * @param       [type]          $key: ...
-        * @param       [type]          $offset: ...
-        * @return      [type]          ...
-        */
-       function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
-               reset($content[$key]);
-               while(list(,$val)=each($content[$key]))  {
-                       $val = substr($val,0,30);       // Max 30 - because the baseword varchar IS 30. This MUST be the same.
-                       $retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset);
-                       $retArr[$val]['count'] = $retArr[$val]['count']+1;
-                       $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
-                       $retArr[$val]['metaphone'] = $this->metaphone($val);
-                       $this->wordcount++;
-               }
-       }
 
-       /**
-        * Calculates relevant information for bodycontent
-        *
-        * @param       [type]          $$retArr: ...
-        * @param       [type]          $content: ...
-        * @return      [type]          ...
-        */
-       function analyzeBody(&$retArr,$content) {
-               reset($content['body']);
-               while(list($key,$val)=each($content['body']))  {
-                       $val = substr($val,0,30);       // Max 30 - because the baseword varchar IS 30. This MUST be the same.
-                       if(!isset($retArr[$val])) {
-                               $retArr[$val]['first']=$key;
-                               $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
-                               $retArr[$val]['metaphone'] = $this->metaphone($val);
-                       }
-                       $retArr[$val]['count'] = $retArr[$val]['count']+1;
-                       $this->wordcount++;
-               }
+               return true;
        }
 
        /**
         * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
         *
-        * @param       [type]          $$body: ...
-        * @return      [type]          ...
+        * @param       string          HTML Content, passed by reference
+        * @return      boolean         Returns true if a TYPO3SEARCH_ tag was found, otherwise false.
         */
        function typoSearchTags(&$body) {
                $expBody = explode('<!--TYPO3SEARCH_',$body);
-#debug($expBody);
+
                if(count($expBody)>1) {
                        $body = '';
-                       reset($expBody);
-                       while(list(,$val)=each($expBody)) {
+
+                       foreach($expBody as $val)       {
                                $part = explode('-->',$val,2);
                                if(trim($part[0])=='begin') {
-                                       $body .= $part[1];
+                                       $body.= $part[1];
                                        $prev = '';
                                } elseif(trim($part[0])=='end') {
-                                       $body .= $prev;
+                                       $body.= $prev;
                                } else {
                                        $prev = $val;
                                }
-#debug($part);
                        }
-#debug(array($body));
                        return true;
                } else {
                        return false;
                }
        }
 
+       /**
+        * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
+        *
+        * @param       string          HTML content
+        * @return      void
+        */
+       function extractLinks($content) {
+
+                       // Get links:
+               $list = $this->extractHyperLinks($content);
+
+                       // Traverse links:
+               foreach($list as $linkInfo)     {
+
+                               // Decode entities:
+                       $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
+
+                               // Parse URL:
+                       $qParts = parse_url($linkSource);
+
+                               // Check for jumpurl (TYPO3 specific thing...)
+                       if ($qParts['query'] && strstr($qParts['query'],'jumpurl='))    {
+                               parse_str($qParts['query'],$getP);
+                               $linkSource = $getP['jumpurl'];
+                               $qParts = parse_url($linkSource);       // parse again due to new linkSource!
+                       }
+
+                       if ($qParts['scheme'])  {
+                               if ($this->indexerConfig['indexExternalURLs'])  {
+                                               // Index external URL (http or otherwise)
+                                       $this->indexExternalUrl($linkSource);
+                               }
+                       } elseif (!$qParts['query']) {
+                               $localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
+                               if ($localFile && @is_file($localFile)) {
+                                               // Index local file:
+                                       $this->indexRegularDocument($linkSource);
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Extracts all links to external documents from content string.
+        *
+        * @param       string          Content to analyse
+        * @return      array           Array of hyperlinks
+        * @see extractLinks()
+        */
+       function extractHyperLinks($string)     {
+               if (!is_object($this->htmlParser))      {
+                       $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
+               }
 
+               $parts = $this->htmlParser->splitTags('a',$string);
+               $list = array();
+               foreach($parts as $k => $v)     {
+                       if ($k%2)       {
+                               $params = $this->htmlParser->get_tag_attributes($v,1);
+                               $firstTagName = $this->htmlParser->getFirstTagName($v); // The 'name' of the first tag
+
+                               switch(strtolower($firstTagName))       {
+                                       case 'a':
+                                               $src = $params[0]['href'];
+                                               if ($src)       {
+                                                       $list[] = array(
+                                                               'tag' => $v,
+                                                               'href' => $params[0]['href']
+                                                       );
+                                               }
+                                       break;
+                               }
+                       }
+               }
 
+               return $list;
+       }
 
 
 
@@ -875,87 +788,82 @@ class tx_indexedsearch_indexer {
 
 
 
-       /**********************************
+       /******************************************
         *
-        * Words
+        * Indexing; external URL
         *
-        **********************************/
+        ******************************************/
 
        /**
-        * Splits the incoming string into words
-        * The $string parameter is a reference and will be made into an array!
+        * Indexes the HTML content of an external URL
         *
-        * @param       [type]          $$string: ...
-        * @return      [type]          ...
+        * @param       string          URL, eg. "http://typo3.org/"
+        * @return      void
+        * @see indexRegularDocument()
         */
-       function split2words(&$string) {
-               $words = split('[[:space:],]+',$string);
-               $reg='['.quotemeta('().,_?!:-').']*';
-               $reg='[^[:alnum:]'.$this->convChars[0].$this->convChars[1].']*';
-
-#debug($words);
-#debug(array($string));
-               reset($words);
-               $matches=array();
-               while(list(,$w)=each($words))   {
-                       $w=trim($w);
-                       $w=ereg_replace('^'.$reg,'',$w);
-                       $w=ereg_replace($reg.'$','',$w);
-                       if ($this->wordOK($w))  {$matches[]=$w;}
-               }
-#              debug($matches);
-               $string =$matches;
+       function indexExternalUrl($externalUrl) {
 
+                       // Parse External URL:
+               $qParts = parse_url($externalUrl);
+               $fI = pathinfo($qParts['path']);
+               $ext = strtolower($fI['extension']);
 
-               /*
-               preg_match_all("/\b(\w[\w']*\w+|\w+)\b/", $string ,$matches);
-               $string = $matches[0];
-               */
-       }
+                       // Get headers:
+               $urlHeaders = $this->getUrlHeaders($externalUrl);
+               if (stristr($urlHeaders['Content-Type'],'text/html'))   {
+                       $content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
+                       if (strlen($content))   {
 
-       /**
-        * Checks if a word is supposed to be indexed.
-        * This assessment includes that the word must be between 1 and 50 chars.
-        * The more exotic feature is that only 30 percent of the word must be non-alphanum characters. This is to exclude binary nonsense. This is done with a little trick it's counted how many chars are converted with a rawurlencode command. THis is not really an exact method, but I guess it's fast.
-        *
-        * @param       [type]          $w: ...
-        * @return      [type]          ...
-        */
-       function wordOK($w)     {
-               if ($w && strlen($w)>1 && strlen($w)<50)        {
-                       if (rawurlencode($w)!=$w)       {
-                               $fChars = count(explode('%',rawurlencode($w)))-1;
-                               $rel = round($fChars/strlen($w)*100);
-                               return $rel<30 ? 1 : 0;         // Max 30% strange chars!
-                       } else {
-                               return 1;
+                                       // Create temporary file:
+                               $tmpFile = t3lib_div::tempnam('EXTERNAL_URL').'.html';
+                               t3lib_div::writeFile($tmpFile, $content);
+
+                                       // Index that file:
+                               $this->indexRegularDocument($externalUrl, FALSE, $tmpFile, 'html');
+                               unlink($tmpFile);
                        }
                }
        }
 
        /**
-        * metaphone
+        * Getting HTTP response headers of URL
         *
-        * @param       [type]          $word: ...
-        * @return      [type]          ...
+        * @param       string          The URL
+        * @param       integer         Connection timeout in seconds (passed to fsockopen())
+        * @return      mixed           Returns false if the URL scheme is neither empty nor "http", or if the connection could not be opened. Otherwise an array where HTTP header names are keys
         */
-       function metaphone($word) {
-               $tmp = metaphone($word);
-               if($tmp=='') $ret=0; else $ret=hexdec(substr(md5($tmp),0,7));
-               return $ret;
-       }
-
-       /**
-        * Converts string-to-lower including special characters.
-        *
-        * @param       [type]          $str: ...
-        * @return      [type]          ...
-        */
-       function strtolower_all($str)   {
-               return strtolower(strtr($str, $this->convChars[0], $this->convChars[1]));
-       }
+       function getUrlHeaders($url, $timeout = 2)      {
+               $url = parse_url($url);
 
+               if(!in_array($url['scheme'],array('','http')))  return FALSE;
 
+               $fp = fsockopen ($url['host'], ($url['port'] > 0 ? $url['port'] : 80), $errno, $errstr, $timeout);
+               if (!$fp)       {
+                       return FALSE;
+               } else {
+                       $msg = "GET ".$url['path'].($url['query'] ? '?'.$url['query'] : '')." HTTP/1.0\r\nHost: ".$url['host']."\r\n\r\n";
+                       fputs ($fp, $msg);
+                       $d = '';
+                       while (!feof($fp)) {
+                               $line = fgets ($fp,2048);
+
+                               $d.=$line;
+                               if (!strlen(trim($line)))       {
+                                       break;
+                               }
+                       }
+                       fclose ($fp);
+
+                               // Compile headers:
+                       $headers = t3lib_div::trimExplode(chr(10),$d,1);
+                       $retVal = array();
+                       foreach($headers as $line)      {
+                               list($headKey, $headValue) = explode(':', $line, 2);
+                               $retVal[$headKey] = $headValue;
+                       }
+                       return $retVal;
+               }
+       }
 
 
 
@@ -969,52 +877,168 @@ class tx_indexedsearch_indexer {
 
 
 
-       /********************************
+       /******************************************
         *
-        * SQL Helper functions
+        * Indexing; external files (PDF, DOC, etc)
         *
-        *******************************/
+        ******************************************/
 
        /**
-        * maps frequency from a real number in [0;1] to an integer in [0;$this->freqRange] with anything above $this->freqMax as 1
-        * and back.
+        * Indexing a regular document given as $file (relative to PATH_site, local file)
         *
-        * @param       [type]          $freq: ...
-        * @return      [type]          ...
+        * @param       string          Relative Filename, relative to PATH_site. It can also be an absolute path (detected with t3lib_div::isAbsPath()) as long as it is an allowed path (validated with t3lib_div::isAllowedAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
+        * @param       boolean         If set, indexing is forced (despite content hashes, mtime etc).
+        * @param       string          Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
+        * @param       string          File extension for temporary file.
+        * @return      void
         */
-       function freqMap($freq) {
-               $mapFactor = $this->freqMax*100*$this->freqRange;
-               if($freq<1) {
-                       $newFreq = $freq*$mapFactor;
-                       $newFreq = $newFreq>$this->freqRange?$this->freqRange:$newFreq;
+       function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')        {
+
+                       // Init
+               $fI = pathinfo($file);
+               $ext = $altExtension ? $altExtension : strtolower($fI['extension']);
+
+                       // Create abs-path:
+               if (!$contentTmpFile)   {
+                       if (!t3lib_div::isAbsPath($file))       {       // Relative, prepend PATH_site:
+                               $absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
+                       } else {        // Absolute, pass-through:
+                               $absFile = $file;
+                       }
+                       $absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
                } else {
-                       $newFreq = $freq/$mapFactor;
+                       $absFile = $contentTmpFile;
                }
-               return $newFreq;
 
+                       // Indexing the document:
+               if ($absFile &&  @is_file($absFile))    {
+                       if ($this->external_parsers[$ext])      {
+                               $mtime = filemtime($absFile);
+                               $cParts = $this->fileContentParts($ext,$absFile);
+
+                               foreach($cParts as $cPKey)      {
+                                       $this->internal_log = array();
+                                       $this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
+                                       $Pstart = t3lib_div::milliseconds();
+                                       $subinfo = array('key' => $cPKey);
+                                       $phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
+                                       $check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
+                                       if ($check > 0 || $force)       {
+                                               if ($check > 0) {
+                                                       $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
+                                               } else {
+                                                       $this->log_setTSlogMessage('Indexing forced by flag',1);
+                                               }
+
+                                                       // Check external file counter:
+                                               if ($this->externalFileCounter < $this->maxExternalFiles || $force)     {
+
+                                                                       // Divide into title,keywords,description and body:
+                                                       $this->log_push('Split content','');
+                                                               $contentParts = $this->readFileContent($ext,$absFile,$cPKey);
+                                                       $this->log_pull();
+
+                                                       if (is_array($contentParts))    {
+                                                                       // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
+                                                               $content_md5h = $this->md5inthash(implode($contentParts,''));
+
+                                                               if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force)  {
+
+                                                                               // Increment counter:
+                                                                       $this->externalFileCounter++;
+
+                                                                               // Splitting words
+                                                                       $this->log_push('Extract words from content','');
+                                                                               $splitInWords = $this->procesWordsInArrays($contentParts);
+                                                                       $this->log_pull();
+
+                                                                               // Analyse the indexed words.
+                                                                       $this->log_push('Analyse the extracted words','');
+                                                                               $indexArr = $this->indexAnalyze($splitInWords);
+                                                                       $this->log_pull();
+
+                                                                               // Submitting page (phash) record
+                                                                       $this->log_push('Submitting page','');
+                                                                               $size = filesize($absFile);
+                                                                               $ctime = filemtime($absFile);   // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
+                                                                               $this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
+                                                                       $this->log_pull();
+
+                                                                               // Check words and submit to word list if not there
+                                                                       $this->log_push('Check word list and submit words','');
+                                                                               $this->checkWordList($indexArr);
+                                                                               $this->submitWords($indexArr,$phash_arr['phash']);
+                                                                       $this->log_pull();
+
+                                                                               // Set parsetime
+                                                                       $this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
+                                                               } else {
+                                                                       $this->updateTstamp($phash_arr['phash'],$mtime);        // Update the timestamp
+                                                                       $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
+                                                               }
+                                                       } else $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
+                                               } else $this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
+                                       } else $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
+
+                                               // Checking and setting sections:
+               #                       $this->submitFile_grlist($phash_arr['phash']);  // Setting a gr_list record if there is none already (set for default fe_group)
+                                       $this->submitFile_section($phash_arr['phash']);         // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
+                                       $this->log_pull();
+                               }
+                       } else $this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
+               } else $this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
        }
 
        /**
-        * [Describe function...]
+        * Reads the content of an external file being indexed.
+        * The content from the external parser MUST be returned in utf-8!
         *
-        * @param       [type]          $$fieldArr: ...
-        * @return      [type]          ...
+        * @param       string          File extension, eg. "pdf", "doc" etc.
+        * @param       string          Absolute filename of file (must exist and be validated OK before calling function)
+        * @param       string          Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
+        * @return      array           Standard content array (title, description, keywords, body keys)
         */
-       function getRootLineFields(&$fieldArr)  {
-               $rl = $this->rootLine;
+       function readFileContent($ext,$absFile,$cPKey)  {
 
-               $fieldArr['rl0'] = intval($rl[0]['uid']);
-               $fieldArr['rl1'] = intval($rl[1]['uid']);
-               $fieldArr['rl2'] = intval($rl[2]['uid']);
+                       // Consult relevant external document parser:
+               if (is_object($this->external_parsers[$ext]))   {
+                       $contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
+               }
 
-               if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']))    {
-                       foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel)  {
-                               $fieldArr[$fieldName] = intval($rl[$rootLineLevel]['uid']);
-                       }
+               return $contentArr;
+       }
+
+       /**
+        * Creates an array with pointers to divisions of document.
+        *
+        * @param       string          File extension
+        * @param       string          Absolute filename (must exist and be validated OK before calling function)
+        * @return      array           Array of pointers to sections that the document should be divided into
+        */
+       function fileContentParts($ext,$absFile)        {
+               $cParts = array(0);
+
+                       // Consult relevant external document parser:
+               if (is_object($this->external_parsers[$ext]))   {
+                       $cParts = $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
                }
+
+               return $cParts;
        }
 
+       /**
+        * Splits non-HTML content (from external files for instance)
+        *
+        * @param       string          Input content (non-HTML) to index.
+        * @return      array           Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
+        * @see splitHTMLContent()
+        */
+       function splitRegularContent($content) {
+               $contentArr = $this->defaultContentArray;
+               $contentArr['body'] = $content;
 
+               return $contentArr;
+       }
 
 
 
@@ -1027,217 +1051,166 @@ class tx_indexedsearch_indexer {
 
 
 
-       /********************************
+
+
+       /**********************************
         *
-        * SQL Helper functions
+        * Analysing content, Extracting words
         *
-        *******************************/
+        **********************************/
 
        /**
-        * Removes ALL data regarding a certain indexed phash-row
+        * Convert character set and HTML entities in the value of input content array keys
         *
-        * @param       [type]          $phashList: ...
-        * @param       [type]          $clearPageCache: ...
-        * @return      [type]          ...
+        * @param       array           Standard content array
+        * @param       string          Charset of the input content (converted to utf-8)
+        * @return      void
         */
-       function removeIndexedPhashRow($phashList,$clearPageCache=1)    {
-               $phashRows=t3lib_div::trimExplode(',',$phashList,1);
-               while(list(,$phash)=each($phashRows))   {
-                       $phash = intval($phash);
-                       if ($phash>0)   {
-
-                               if ($clearPageCache)    {
-                                               // Clearing page cache:
-                                       $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('page_id', 'index_section', 'phash='.intval($phash));
-                                       if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))   {
-                                               $idList = array();
-                                               while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))       {
-                                                       $idList[] = $row['page_id'];
-                                               }
-                                               $GLOBALS['TYPO3_DB']->exec_DELETEquery('cache_pages', 'page_id IN ('.implode(',',$GLOBALS['TYPO3_DB']->cleanIntArray($idList)).')');
-                                       }
-                               }
+       function charsetEntity2utf8(&$contentArr, $charset)     {
 
-                                       // Removing old registrations for all tables.
-                               $tableArr = explode(',','index_phash,index_rel,index_section,index_fulltext,index_grlist');
-                               foreach($tableArr as $table)    {
-                                       $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, $table).'"');
+                       // Convert charset if necessary
+               reset($contentArr);
+               while(list($key,)=each($contentArr)) {
+                       if (strlen($contentArr[$key]))  {
+                               if ($charset!=='utf-8') {
+                                       $contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
                                }
 
-                                       // Did not remove any index_section records for external files where phash_t3 points to this hash!
-#debug('DELETE: '.$phash,1);
+                                       // decode all numeric / html-entities in the string to real characters:
+                               $contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
                        }
                }
        }
 
        /**
-        * Check the mtime / tstamp of the currently indexed page/file (based on phash)
-        * Return positive integer if the page needs to being indexed!
+        * Processing words in the array from split*Content -functions
         *
-        * @param       integer         mtime value to test against limits and indexed page.
-        * @param       integer         Maximum age in seconds.
-        * @param       integer         Minimum age in seconds.
-        * @param       integer         "phash" used to select any already indexed page to see what its mtime is.
-        * @return      integer         Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceed and so indexing cannot occur.  -1) Mtimes matched so no need to reindex page. 0) N/A   1) Max age exceeded, page must be indexed again.   2) mtime of indexed page doesn't match mtime given for current content and we must index page.  3) No mtime was set, so we will index...  4) No indexed page found, so of course we will index.
+        * @param       array           Array of content to index, see splitHTMLContent() and splitRegularContent()
+        * @return      array           Content input array modified so each key is now an array of words (duplicates removed for title, keywords and description)
         */
-       function checkMtimeTstamp($mtime,$maxAge,$minAge,$phash)        {
+       function procesWordsInArrays($contentArr)       {
 
-                       // Select indexed page:
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
-               $out = 0;
+                       // split all parts to words
+               reset($contentArr);
+               while(list($key,)=each($contentArr)) {
+                       $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
+               }
 
-                       // If there was an indexing of the page...:
-               if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
-                       if ($maxAge && ($row['tstamp']+$maxAge)<time()) {               // If min age is exceeded, index the page
-                               $out = 1;
-                       } else {
-                               if (!$minAge || ($row['tstamp']+$minAge)<time())        {       // if minAge is not set or if minAge is exceeded, consider at mtime
-                                       if ($mtime)     {               // It mtime is set, then it's tested. If not, the page must clearly be indexed.
-                                               if ($row['item_mtime'] != $mtime)       {       // And if mtime is different from the index_phash mtime, it's about time to re-index.
-                                                       $out = 2;
-                                               } else {
-                                                       $out = -1;
-                                                       $this->updateTstamp($phash);    // Update the timestatmp
-                                                       $GLOBALS['TT']->setTSlogMessage('Mtime matched, timestamp updated.',1);
-                                               }
-                                       } else {$out = 3;       }
-                               } else {$out = -2;}
-                       }
-               } else {$out = 4;}      // No indexing found.
-               return $out;
+                       // For title, keywords, and description we don't want duplicates:
+               $contentArr['title'] = array_unique($contentArr['title']);
+               $contentArr['keywords'] = array_unique($contentArr['keywords']);
+               $contentArr['description'] = array_unique($contentArr['description']);
+
+                       // Return modified array:
+               return $contentArr;
        }
 
        /**
-        * Check if an grlist-entry for this hash exists and if not so, write one.
+        * Extracts the sample description text from the content array.
         *
-        * @param       [type]          $phash: ...
-        * @param       [type]          $phash_x: ...
-        * @return      [type]          ...
+        * @param       array           Content array
+        * @return      string          Description string
         */
-       function update_grlist($phash,$phash_x) {
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->pObj->gr_list));
-               if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res))  {
-                       $this->submit_grlist($phash,$phash_x);
-                       $GLOBALS['TT']->setTSlogMessage("Inserted gr_list '".$this->pObj->gr_list."' for phash '".$phash."'",1);
-               }
-       }
+       function bodyDescription($contentArr)   {
 
-       /**
-        * @param       [type]          $phash_x: ...
-        * @return      [type]          ...
-        */
-       function is_grlist_set($phash_x)        {
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', 'phash_x='.intval($phash_x));
-               return $GLOBALS['TYPO3_DB']->sql_num_rows($res);
-       }
+                       // Setting description
+               $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
+               if ($maxL)      {
+                               // Takes the quadruple length first, because whitespace and entities may be removed and thus shorten the string more yet.
+                       $bodyDescription = implode(' ',split('[[:space:],]+',substr(trim($contentArr['body']),0,$maxL*4)));
 
-       /**
-        * Check content hash
-        * Returns true if the page needs to be indexed (that is, there was no result)
-        *
-        * @return      [type]          ...
-        */
-       function checkContentHash()     {
-                       // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash AS A', 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h));
-               if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
-                       return $row;
+                               // Shorten the string:
+                       $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
                }
-               return 1;
+
+               return $bodyDescription;
        }
 
        /**
-        * Removes any indexed pages with userlogins which has the same contentHash
+        * Analyzes content to use for indexing,
         *
-        * @return      [type]          ...
+        * @param       array           Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
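+        * @return      array           Index array keyed by word; each entry holds "count", "hash" and "metaphone", plus "cmp" for header words or "first" for body-only words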
         */
-       function removeLoginpagesWithContentHash()      {
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash AS A,index_grlist AS B', '
-                                       A.phash=B.phash
-                                       AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
-                                       AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
-                                       AND A.contentHash='.intval($this->content_md5h));
-               while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))       {
-                       $GLOBALS['TT']->setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$row['phash']."' are now removed.",1);
-                       $this->removeOldIndexedPages($row['phash']);
-               }
+       function indexAnalyze($content) {
+               $indexArr = Array();
+               $counter = 0;
+
+               $this->analyzeHeaderinfo($indexArr,$content,'title',7);
+               $this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
+               $this->analyzeHeaderinfo($indexArr,$content,'description',5);
+               $this->analyzeBody($indexArr,$content);
+
+               return ($indexArr);
        }
 
        /**
-        * Removes records for the indexed page, $phash
+        * Calculates relevant information for headercontent
         *
-        * @param       [type]          $phash: ...
-        * @return      [type]          ...
+        * @param       array           Index array, passed by reference
+        * @param       array           Standard content array
+        * @param       string          Key from standard content array
+        * @param       integer         Bit-wise priority to type
+        * @return      void
         */
-       function removeOldIndexedPages($phash)  {
-                       // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
-               $tableArr = explode(',','index_phash,index_section,index_grlist,index_fulltext');
-               foreach($tableArr as $table)    {
-                       $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, $table).'"');
+       function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
+               reset($content[$key]);
+               while(list(,$val)=each($content[$key]))  {
+                       $val = substr($val,0,60);       // Max 60 - because the baseword varchar IS 60. This MUST be the same.
+                       $retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset);
+                       $retArr[$val]['count'] = $retArr[$val]['count']+1;
+                       $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
+                       $retArr[$val]['metaphone'] = $this->metaphone($val);
+                       $this->wordcount++;
                }
-                       // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
-               $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, 'index_section').'"');
        }
 
        /**
-        * Check content hash for external documents
-        * Returns true if the document needs to be indexed (that is, there was no result)
+        * Calculates relevant information for bodycontent
         *
-        * @param       [type]          $hashGr: ...
-        * @param       [type]          $content_md5h: ...
-        * @return      [type]          ...
+        * @param       array           Index array, passed by reference
+        * @param       array           Standard content array
+        * @return      void
         */
-       function checkExternalDocContentHash($hashGr,$content_md5h)     {
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash AS A', 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h));
-               if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
-                       return 0;
+       function analyzeBody(&$retArr,$content) {
+               foreach($content['body'] as $key => $val)       {
+                       $val = substr($val,0,60);       // Max 60 - because the baseword varchar IS 60. This MUST be the same.
+                       if(!isset($retArr[$val])) {
+                               $retArr[$val]['first'] = $key;
+                               $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
+                               $retArr[$val]['metaphone'] = $this->metaphone($val);
+                       }
+                       $retArr[$val]['count'] = $retArr[$val]['count']+1;
+                       $this->wordcount++;
                }
-               return 1;
        }
 
        /**
-        * Update tstamp
+        * Creating metaphone based hash from input word
         *
-        * @param       [type]          $phash: ...
-        * @param       [type]          $mtime: ...
-        * @return      [type]          ...
+        * @param       string          Word to convert
+        * @param       boolean         If set, returns the raw metaphone value (not hashed)
+        * @return      mixed           Metaphone hash integer (or raw value, string)
         */
-       function updateTstamp($phash,$mtime=0)  {
-               $updateFields = array(
-                       'tstamp' => time()
-               );
-               if ($mtime)     { $updateFields['item_mtime'] = intval($mtime); }
+       function metaphone($word,$retRaw=FALSE) {
 
-               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields);
-       }
+               if (is_object($this->metaphoneObj))     {
+                       $tmp = $this->metaphoneObj->metaphone($word);
+               } else {
+                       $tmp = metaphone($word);
+               }
 
-       /**
-        * Update parsetime
-        *
-        * @param       [type]          $phash: ...
-        * @param       [type]          $parsetime: ...
-        * @return      [type]          ...
-        */
-       function updateParsetime($phash,$parsetime)     {
-               $updateFields = array(
-                       'parsetime' => intval($parsetime)
-               );
+                       // Return raw value?
+               if ($retRaw)    return $tmp;
 
-               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields);
+                       // Otherwise create hash and return integer
+               if($tmp=='') $ret=0; else $ret=hexdec(substr(md5($tmp),0,7));
+               return $ret;
        }
 
-       /**
-        * Update section rootline for the page
-        *
-        * @return      [type]          ...
-        */
-       function updateRootline()       {
 
-               $updateFields = array();
-               $this->getRootLineFields($updateFields);
 
-               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id='.intval($this->pObj->id), $updateFields);
-       }
+
 
 
 
@@ -1252,91 +1225,109 @@ class tx_indexedsearch_indexer {
 
        /********************************
         *
-        * SQL; Inserting in database
+        * SQL; TYPO3 Pages
         *
         *******************************/
 
        /**
-        * Updates db with information about the page
+        * Updates db with information about the page (TYPO3 page, not external media)
         *
-        * @return      [type]          ...
+        * @return      void
         */
        function submitPage()   {
+
+                       // Remove any current data for this phash:
                $this->removeOldIndexedPages($this->hash['phash']);
 
-                       // setting new
+                       // setting new phash_row
                $fields = array(
                        'phash' => $this->hash['phash'],
                        'phash_grouping' => $this->hash['phash_grouping'],
                        'cHashParams' => serialize($this->cHashParams),
                        'contentHash' => $this->content_md5h,
-                       'data_page_id' => $this->pObj->id,
-                       'data_page_reg1' => $this->pObj->page_cache_reg1,
-                       'data_page_type' => $this->pObj->type,
-                       'data_page_mp' => $this->pObj->MP,
-                       'gr_list' => $this->pObj->gr_list,
+                       'data_page_id' => $this->conf['id'],
+                       'data_page_reg1' => $this->conf['page_cache_reg1'],
+                       'data_page_type' => $this->conf['type'],
+                       'data_page_mp' => $this->conf['MP'],
+                       'gr_list' => $this->conf['gr_list'],
                        'item_type' => 0,       // TYPO3 page
                        'item_title' => $this->contentParts['title'],
                        'item_description' => $this->bodyDescription($this->contentParts),
-                       'item_mtime' => $this->mtime,
-                       'item_size' => strlen($this->pObj->content),
+                       'item_mtime' => $this->conf['mtime'],
+                       'item_size' => strlen($this->conf['content']),
                        'tstamp' => time(),
                        'crdate' => time(),
-                       'item_crdate' => $this->pObj->page['crdate'],   // Creation date of page
-                       'sys_language_uid' => $this->pObj->sys_language_uid     // Sys language uid of the page. Should reflect which language it DOES actually display!
+                       'item_crdate' => $this->conf['crdate'], // Creation date of page
+                       'sys_language_uid' => $this->conf['sys_language_uid'],  // Sys language uid of the page. Should reflect which language it DOES actually display!
+                       'externalUrl' => 0,
+                       'recordUid' => intval($this->conf['recordUid']),
+                       'freeIndexUid' => intval($this->conf['freeIndexUid']),
                );
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
 
-               // ************************
-               // PROCESSING index_section
-               // ************************
+                       // PROCESSING index_section
                $this->submit_section($this->hash['phash'],$this->hash['phash']);
 
-               // ************************
-               // PROCESSING index_grlist
-               // ************************
+                       // PROCESSING index_grlist
                $this->submit_grlist($this->hash['phash'],$this->hash['phash']);
 
-               // ************************
-               // PROCESSING index_fulltext
-               // ************************
+                       // PROCESSING index_fulltext
                $fields = array(
                        'phash' => $this->hash['phash'],
-                       'fulltextdata' => implode($this->contentParts,' ')
+                       'fulltextdata' => implode(' ', $this->contentParts)
                );
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
+
+                       // PROCESSING index_debug
+               if ($this->indexerConfig['debugMode'])  {
+                       $fields = array(
+                               'phash' => $this->hash['phash'],
+                               'debuginfo' => serialize(array(
+                                               'cHashParams' => $this->cHashParams,
+                                               'external_parsers initialized' => array_keys($this->external_parsers),
+                                               'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
+                                               'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
+                                               'logs' => $this->internal_log,
+                                               'lexer' => $this->lexerObj->debugString,
+                                       ))
+                       );
+                       $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
+               }
        }
 
        /**
-        * Stores gr_list
+        * Stores gr_list in the database.
         *
-        * @param       [type]          $hash: ...
-        * @param       [type]          $phash_x: ...
-        * @return      [type]          ...
+        * @param       integer         Search result record phash
+        * @param       integer         Actual phash of current content
+        * @return      void
+        * @see update_grlist()
         */
        function submit_grlist($hash,$phash_x)  {
+
                        // Setting the gr_list record
                $fields = array(
                        'phash' => $hash,
                        'phash_x' => $phash_x,
-                       'hash_gr_list' => $this->md5inthash($this->pObj->gr_list),
-                       'gr_list' => $this->pObj->gr_list
+                       'hash_gr_list' => $this->md5inthash($this->conf['gr_list']),
+                       'gr_list' => $this->conf['gr_list']
                );
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $fields);
        }
 
        /**
         * Stores section
+        * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
         *
-        * @param       [type]          $hash: ...
-        * @param       [type]          $hash_t3: ...
-        * @return      [type]          ...
+        * @param       integer         phash of the indexed record itself (stored as "phash"; for external media this is the file's phash)
+        * @param       integer         phash of the TYPO3 parent page (stored as "phash_t3")
+        * @return      void
         */
        function submit_section($hash,$hash_t3) {
                $fields = array(
                        'phash' => $hash,
                        'phash_t3' => $hash_t3,
-                       'page_id' => intval($this->pObj->id)
+                       'page_id' => intval($this->conf['id'])
                );
 
                $this->getRootLineFields($fields);
@@ -1345,25 +1336,62 @@ class tx_indexedsearch_indexer {
        }
 
        /**
-        * Updates db with information about the file
+        * Removes records for the indexed page, $phash
         *
-        * @param       [type]          $hash: ...
-        * @param       [type]          $file: ...
-        * @param       [type]          $subinfo: ...
-        * @param       [type]          $ext: ...
-        * @param       [type]          $mtime: ...
-        * @param       [type]          $ctime: ...
-        * @param       [type]          $size: ...
-        * @param       [type]          $content_md5h: ...
-        * @param       [type]          $contentParts: ...
-        * @return      [type]          ...
+        * @param       integer         phash value to flush
+        * @return      void
         */
-       function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)      {
-                       // Removing old registrations for tables.
-               $tableArr = explode(',','index_phash,index_fulltext,index_grlist');
+       function removeOldIndexedPages($phash)  {
+                       // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
+               $tableArr = explode(',','index_phash,index_section,index_grlist,index_fulltext,index_debug');
                foreach($tableArr as $table)    {
-                       $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($hash['phash'], $table).'"');
+                       $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($phash));
                }
+                       // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
+               $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.intval($phash));
+       }
+
+
+
+
+
+
+
+
+
+
+
+
+
+       /********************************
+        *
+        * SQL; External media
+        *
+        *******************************/
+
+
+       /**
+        * Updates db with information about the file
+        *
+        * @param       array           Array with phash and phash_grouping keys for file
+        * @param       string          File name
+        * @param       array           Array of "cHashParams" for files: This is for instance the page index for a PDF file (for other document types it will be a zero)
+        * @param       string          File extension determining the type of media.
+        * @param       integer         Modification time of file.
+        * @param       integer         Creation time of file.
+        * @param       integer         Size of file in bytes
+        * @param       integer         Content HASH value.
+        * @param       array           Standard content array (using only title and body for a file)
+        * @return      void
+        */
+       function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)      {
+
+                       // Remove any current data for this phash:
+               $this->removeOldIndexedFiles($hash['phash']);
+
+                       // Split filename:
+               $fileParts = parse_url($file);
+
                        // setting new
                $fields = array(
                        'phash' => $hash['phash'],
@@ -1371,7 +1399,7 @@ class tx_indexedsearch_indexer {
                        'cHashParams' => serialize($subinfo),
                        'contentHash' => $content_md5h,
                        'data_filename' => $file,
-                       'item_type' => intval($this->Itypes[$ext]) ? intval($this->Itypes[$ext]) : -1,
+                       'item_type' => $ext,
                        'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
                        'item_description' => $this->bodyDescription($contentParts),
                        'item_mtime' => $mtime,
@@ -1379,32 +1407,44 @@ class tx_indexedsearch_indexer {
                        'item_crdate' => $ctime,
                        'tstamp' => time(),
                        'crdate' => time(),
-                       'gr_list' => $this->pObj->gr_list
+                       'gr_list' => $this->conf['gr_list'],
+                       'externalUrl' => $fileParts['scheme'] ? 1 : 0,
+                       'recordUid' => intval($this->conf['recordUid']),
+                       'freeIndexUid' => intval($this->conf['freeIndexUid']),
                );
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
 
-               // ************************
-               // PROCESSING index_fulltext
-               // ************************
+                       // PROCESSING index_fulltext
                $fields = array(
                        'phash' => $hash['phash'],
-                       'fulltextdata' => implode($contentParts,' ')
+                       'fulltextdata' => implode(' ', $contentParts)
                );
                $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
+
+                       // PROCESSING index_debug
+               if ($this->indexerConfig['debugMode'])  {
+                       $fields = array(
+                               'phash' => $hash['phash'],
+                               'debuginfo' => serialize(array(
+                                               'cHashParams' => $subinfo,
+                                               'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
+                                               'logs' => $this->internal_log,
+                                               'lexer' => $this->lexerObj->debugString,
+                                       ))
+                       );
+                       $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
+               }
        }
 
        /**
-        * Stores file gr_list for a file IF it does not exist
+        * Stores file gr_list for a file IF it does not exist already
         *
-        * @param       [type]          $hash: ...
-        * @return      [type]          ...
+        * @param       integer         phash value of file
+        * @return      void
         */
        function submitFile_grlist($hash)       {
-               // ************************
-               // PROCESSING index_grlist
-               // ************************
                        // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->pObj->gr_list).')');
+               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')');
                if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res))  {
                        $this->submit_grlist($hash,$hash);
                }
@@ -1413,30 +1453,256 @@ class tx_indexedsearch_indexer {
        /**
         * Stores file section for a file IF it does not exist
         *
-        * @param       [type]          $hash: ...
-        * @return      [type]          ...
+        * @param       integer         phash value of file
+        * @return      void
         */
        function submitFile_section($hash)      {
-               // ************************
-               // PROCESSING index_grlist
-               // ************************
-                       // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
-               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', 'phash='.intval($hash).' AND page_id='.intval($this->pObj->id));
+                       // Testing if there is a section
+               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']));
                if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res))  {
                        $this->submit_section($hash,$this->hash['phash']);
                }
        }
 
        /**
+        * Removes records for the indexed page, $phash
+        *
+        * @param       integer         phash value to flush
+        * @return      void
+        */
+       function removeOldIndexedFiles($phash) {
+               $phashCondition = 'phash='.intval($phash);
+
+                       // Delete the rows registered for this phash from each of the per-document tables:
+               foreach(array('index_phash','index_grlist','index_fulltext','index_debug') as $tableName) {
+                       $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $phashCondition);
+               }
+       }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+       /********************************
+        *
+        * SQL Helper functions
+        *
+        *******************************/
+
+       /**
+        * Check the mtime / tstamp of the currently indexed page/file (based on phash)
+        * Returns a positive integer if the page needs to be indexed!
+        *
+        * @param       integer         mtime value to test against limits and indexed page.
+        * @param       integer         "phash" used to select any already indexed page to see what its mtime is.
+        * @return      integer         Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceed and so indexing cannot occur.  -1) Mtimes matched so no need to reindex page. 0) N/A   1) Max age exceeded, page must be indexed again.   2) mtime of indexed page doesn't match mtime given for current content and we must index page.  3) No mtime was set, so we will index...  4) No indexed page found, so of course we will index.
+        */
+       function checkMtimeTstamp($mtime,$phash) {
+
+                       // Look up the index_phash record (if any) for this phash:
+               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
+               $indexedRow = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
+
+                       // 4: No record in index_phash, so the page has never been indexed.
+               if (!$indexedRow) {
+                       return 4;
+               }
+
+                       // 1: A max age is configured and the stored tstamp is older than that; index again.
+               if ($this->tstamp_maxAge && ($indexedRow['tstamp']+$this->tstamp_maxAge) < time()) {
+                       return 1;
+               }
+
+                       // -2: A min age is configured and has not been exceeded yet; no indexing.
+               if ($this->tstamp_minAge && ($indexedRow['tstamp']+$this->tstamp_minAge) >= time()) {
+                       return -2;
+               }
+
+                       // 3: No mtime was given to compare with, so the page is indexed.
+               if (!$mtime) {
+                       return 3;
+               }
+
+                       // 2: The mtime given differs from the one stored in index_phash; index again.
+               if ($indexedRow['item_mtime'] != $mtime) {
+                       return 2;
+               }
+
+                       // -1: The mtime matched, so no changes are detected and no content is updated.
+                       // The timestamp is only refreshed when no max age is configured.
+               if ($this->tstamp_maxAge) {
+                       $this->log_setTSlogMessage('Mtime matched, timestamp NOT updated because a maxAge is set ('.($indexedRow['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
+               } else {
+                       $this->updateTstamp($phash);
+                       $this->log_setTSlogMessage('Mtime matched, timestamp updated.',1);
+               }
+               return -1;
+       }
+
+       /**
+        * Check content hash in phash table
+        *
+        * @return      mixed           Returns true if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
+        */
+       function checkContentHash() {
+                       // Only rows of the same "phash_grouping" with identical content hash are of interest:
+               $where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
+               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash AS A', $where);
+               $matchingRow = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
+
+                       // The first matching row (array with "phash") if there is one, otherwise 1 (= index the page):
+               return $matchingRow ? $matchingRow : 1;
+       }
+
+       /**
+        * Check content hash for external documents
+        * Returns true if the document needs to be indexed (that is, there was no result)
+        *
+        * @param       integer         phash value to check (phash_grouping)
+        * @param       integer         Content hash to check
+        * @return      boolean         Returns true if the document needs to be indexed (that is, there was no result)
+        */
+       function checkExternalDocContentHash($hashGr,$content_md5h) {
+               $where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
+               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash AS A', $where);
+
+                       // 0 when a row with this grouping and content hash exists, 1 when there is none:
+               return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
+       }
+
+       /**
+        * Checks if a grlist record has been set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
+        *
+        * @param       integer         Phash integer to test.
+        * @return      integer         Number of index_grlist rows found for the phash_x value (0 if none is set)
+        */
+       function is_grlist_set($phash_x) {
+               $where = 'phash_x='.intval($phash_x);
+               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);
+
+                       // The row count is returned as the "is set" indicator:
+               $rowCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);
+               return $rowCount;
+       }
+
+       /**
+        * Check if an grlist-entry for this hash exists and if not so, write one.
+        *
+        * @param       integer         phash of the search result that should be found
+        * @param       integer         The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
+        * @return      void
+        * @see submit_grlist()
+        */
+       function update_grlist($phash,$phash_x) {
+               $where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
+               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);
+
+                       // Nothing to do if an entry for this phash and gr_list exists already:
+               if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
+                       return;
+               }
+
+               $this->submit_grlist($phash,$phash_x);
+               $this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
+       }
+
+       /**
+        * Update tstamp for a phash row.
+        *
+        * @param       integer         phash value
+        * @param       integer         If set, update the mtime field to this value.
+        * @return      void
+        */
+       function updateTstamp($phash,$mtime=0) {
+                       // tstamp is always set to the current time; item_mtime only when an mtime was passed:
+               $fields = array();
+               $fields['tstamp'] = time();
+               if ($mtime) {
+                       $fields['item_mtime'] = intval($mtime);
+               }
+
+               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $fields);
+       }
+
+       /**
+        * Update parsetime for phash row.
+        *
+        * @param       integer         phash value.
+        * @param       integer         Parsetime value to set.
+        * @return      void
+        */
+       function updateParsetime($phash,$parsetime) {
+               $where = 'phash='.intval($phash);
+
+                       // Only the parsetime column of the index_phash row is written:
+               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array('parsetime' => intval($parsetime)));
+       }
+
+       /**
+        * Update section rootline for the page
+        *
+        * @return      void
+        */
+       function updateRootline() {
+                       // Let getRootLineFields() fill in the rootline columns (the array is passed by reference):
+               $rootLineFields = array();
+               $this->getRootLineFields($rootLineFields);
+
+                       // Write them to the index_section rows of the current page id:
+               $where = 'page_id='.intval($this->conf['id']);
+               $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
+       }
+
+       /**
+        * Adding values for root-line fields.
+        * rl0, rl1 and rl2 are standard. A hook might add more.
+        *
+        * @param       array           Field array, passed by reference
+        * @return      void
+        */
+       function getRootLineFields(&$fieldArr) {
+
+                       // Standard fields rl0, rl1 and rl2 are taken from rootline levels 0, 1 and 2:
+               for ($level = 0; $level <= 2; $level++) {
+                       $fieldArr['rl'.$level] = intval($this->conf['rootline_uids'][$level]);
+               }
+
+                       // Additional fields configured as "field name => rootline level" pairs:
+               $extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
+               if (is_array($extraFields)) {
+                       foreach($extraFields as $extraFieldName => $extraLevel) {
+                               $fieldArr[$extraFieldName] = intval($this->conf['rootline_uids'][$extraLevel]);
+                       }
+               }
+       }
+
+       /**
+        * Removes any indexed pages with userlogins which has the same contentHash
+        * NOT USED anywhere inside this class!
+        *
+        * @return      void
+        */
+       function removeLoginpagesWithContentHash()      {
+                       // Select joined index_phash/index_grlist rows of the current phash_grouping which have the current content hash but a hash_gr_list different from the one of $this->defaultGrList:
+               $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash AS A,index_grlist AS B', '
+                                       A.phash=B.phash
+                                       AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
+                                       AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
+                                       AND A.contentHash='.intval($this->content_md5h));
+                       // Each row found is logged and its phash is passed on to removeOldIndexedPages():
+               while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))       {
+                       $this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$row['phash']."' are now removed.",1);
+                       $this->removeOldIndexedPages($row['phash']);
+               }
+       }
+
+
+
+
+
+
+
+
+
+
+
+
+       /********************************
+        *
+        * SQL; Submitting words
+        *
+        *******************************/
+
+       /**
         * Adds new words to db
         *
-        * @param       [type]          $wl: ...
-        * @return      [type]          ...
+        * @param       array           Word List array (where each word has information about position etc).
+        * @return      void
         */
        function checkWordList($wl) {
                reset($wl);
-               $phashArr=array();
-               while(list($key,)=each($wl)) {
+               $phashArr = array();
+               while(list($key,) = each($wl)) {
                        $phashArr[] = $wl[$key]['hash'];
                }
                if (count($phashArr))   {
@@ -1444,7 +1710,7 @@ class tx_indexedsearch_indexer {
                        $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.$cwl.')');
 
                        if($GLOBALS['TYPO3_DB']->sql_num_rows($res)!=count($wl)) {
-                               $GLOBALS['TT']->setTSlogMessage('Inserting words: '.(count($wl)-$GLOBALS['TYPO3_DB']->sql_num_rows($res)),1);
+                               $this->log_setTSlogMessage('Inserting words: '.(count($wl)-$GLOBALS['TYPO3_DB']->sql_num_rows($res)),1);
                                while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
                                        unset($wl[$row['baseword']]);
                                }
@@ -1456,7 +1722,7 @@ class tx_indexedsearch_indexer {
                                                'baseword' => $key,
                                                'metaphone' => $val['metaphone']
                                        );
-                                               // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer as 30 chars (the baseword varchar is 30 characters...) this is not a problem.
+                                               // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...) this is not a problem.
                                        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
                                }
                        }
@@ -1464,14 +1730,14 @@ class tx_indexedsearch_indexer {
        }
 
        /**
-        * Submits information about words on the page to the db
+        * Submits RELATIONS between words and phash
         *
-        * @param       [type]          $wl: ...
-        * @param       [type]          $phash: ...
-        * @return      [type]          ...
+        * @param       array           Word list array
+        * @param       integer         phash value
+        * @return      void
         */
        function submitWords($wl,$phash) {
-               $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, 'index_rel').'"');
+               $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));
 
                foreach($wl as $val)    {
                        $insertFields = array(
@@ -1480,18 +1746,31 @@ class tx_indexedsearch_indexer {
                                'count' => $val['count'],
                                'first' => $val['first'],
                                'freq' => $this->freqMap(($val['count']/$this->wordcount)),
-                               'flags' => $val['cmp']
+                               'flags' => ($val['cmp'] & $this->flagBitMask)
                        );
 
                        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $insertFields);
                }
        }
 
+       /**
+        * maps frequency from a real number in [0;1] to an integer in [0;$this->freqRange] with anything above $this->freqMax as 1
+        * and back.
+        *
+        * @param       double          Frequency
+        * @return      integer         Frequency in range.
+        */
+       function freqMap($freq) {
+               $mapFactor = $this->freqMax*100*$this->freqRange;
+               if($freq<1) {
+                       $newFreq = $freq*$mapFactor;
+                       $newFreq = $newFreq>$this->freqRange?$this->freqRange:$newFreq;
+               } else {
+                       $newFreq = $freq/$mapFactor;
+               }
+               return $newFreq;
 
-
-
-
-
+       }
 
 
 
@@ -1512,30 +1791,33 @@ class tx_indexedsearch_indexer {
        /**
         * Get search hash, T3 pages
         *
-        * @return      [type]          ...
+        * @return      void
         */
        function setT3Hashes()  {
+
                        //  Set main array:
                $hArray = array(
-                       'id' => $this->pObj->id,
-                       'type' => $this->pObj->type,
-                       'sys_lang' => $this->pObj->sys_language_uid,
-                       'MP' => $this->pObj->MP,
+                       'id' => (integer)$this->conf['id'],
+                       'type' => (integer)$this->conf['type'],
+                       'sys_lang' => (integer)$this->conf['sys_language_uid'],
+                       'MP' => (string)$this->conf['MP'],
                        'cHash' => $this->cHashParams
                );
-                       // Set grouping hash:
+
+                       // Set grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
                $this->hash['phash_grouping'] = $this->md5inthash(serialize($hArray));
-                       // Add gr_list and set plain phash
-               $hArray['gr_list']=$this->pObj->gr_list;
+
+                       // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
+               $hArray['gr_list'] = (string)$this->conf['gr_list'];
                $this->hash['phash'] = $this->md5inthash(serialize($hArray));
        }
 
        /**
         * Get search hash, external files
         *
-        * @param       [type]          $file: ...
-        * @param       [type]          $subinfo: ...
-        * @return      [type]          ...
+        * @param       string          File name / path which identifies it on the server
+        * @param       array           Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
+        * @return      array           Array with "phash_grouping" and "phash" inside.
         */
        function setExtHashes($file,$subinfo=array())   {
                        //  Set main array:
@@ -1556,15 +1838,87 @@ class tx_indexedsearch_indexer {
 
        /**
         * md5 integer hash
+        * Using 7 instead of 8 just because that makes the integers lower than 32 bit (28 bit) and so they do not interfere with UNSIGNED integers or PHP-versions which has varying output from the hexdec function.
         *
-        * @param       [type]          $str: ...
-        * @return      [type]          ...
+        * @param       string          String to hash
+        * @return      integer         Integer interpretation of the md5 hash of input string.
         */
        function md5inthash($str)       {
-                       // Using 7 instead of 8 just because that makes the integers lower than 32 bit (28 bit) and so they does not interfere with UNSIGNED integers or PHP-versions which has varying output from the hexdec function.
-                       // NOTICE: This must be changed a number of other places as well!
                return hexdec(substr(md5($str),0,7));
        }
+
+       /**
+        * Calculates the cHash value of input GET array (for constructing cHash values if needed)
+        *
+        * @param       array           Array of GET parameters to encode
+        * @return      string          The cHash value (result of t3lib_div::shortMD5() on the serialized parameter array)
+        */
+       function makeCHash($paramArray) {
+                       // Build the query string and drop its first character (presumably the leading "&" - verify against t3lib_div::implodeArrayForUrl()):
+               $queryString = substr(t3lib_div::implodeArrayForUrl('', $paramArray), 1);
+
+                       // Collect the decoded value of every parameter which is not on the exclude list:
+               $hashInput = array();
+               foreach(explode('&', $queryString) as $pair) {
+                       $nameAndValue = explode('=', $pair);
+                       if (t3lib_div::inList('id,type,no_cache,cHash,MP,ftu', $nameAndValue[0])) {
+                               continue;
+                       }
+                       $hashInput[$nameAndValue[0]] = (string)rawurldecode($nameAndValue[1]);
+               }
+
+                       // The encryption key is part of the hash input; keys are sorted before serializing:
+               $hashInput['encryptionKey'] = $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'];
+               ksort($hashInput);
+
+               return t3lib_div::shortMD5(serialize($hashInput));
+       }
+
+
+
+
+
+
+
+
+
+
+
+
+       /*********************************
+        *
+        * Internal logging functions
+        *
+        *********************************/
+
+       /**
+        * Push function wrapper for TT logging
+        *
+        * @param       string          Title to set
+        * @param       string          Key (?)
+        * @return      void
+        */
+       function log_push($msg,$key) {
+                       // Without a TT object in $GLOBALS there is nothing to push to:
+               if (!is_object($GLOBALS['TT'])) {
+                       return;
+               }
+               $GLOBALS['TT']->push($msg,$key);
+       }
+
+       /**
+        * Pull function wrapper for TT logging
+        *
+        * @return      void
+        */
+       function log_pull() {
+                       // Without a TT object in $GLOBALS there is nothing to pull from:
+               if (!is_object($GLOBALS['TT'])) {
+                       return;
+               }
+               $GLOBALS['TT']->pull();
+       }
+
+       /**
+        * Set log message function wrapper for TT logging
+        *
+        * @param       string          Message to set
+        * @param       integer         Error number
+        * @return      void
+        */
+       function log_setTSlogMessage($msg, $errorNum=0) {
+                       // Forward to the TT object when there is one:
+               if (is_object($GLOBALS['TT'])) {
+                       $GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
+               }
+
+                       // The message is always appended to the internal log as well:
+               $this->internal_log[] = $msg;
+       }
 }
 
 
diff --git a/typo3/sysext/indexed_search/class.lexer.php b/typo3/sysext/indexed_search/class.lexer.php
new file mode 100755 (executable)
index 0000000..a3e04dc
--- /dev/null
@@ -0,0 +1,305 @@
+<?php
+/***************************************************************
+*  Copyright notice
+*
+*  (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
+*  All rights reserved
+*
+*  This script is part of the TYPO3 project. The TYPO3 project is
+*  free software; you can redistribute it and/or modify
+*  it under the terms of the GNU General Public License as published by
+*  the Free Software Foundation; either version 2 of the License, or
+*  (at your option) any later version.
+*
+*  The GNU General Public License can be found at
+*  http://www.gnu.org/copyleft/gpl.html.
+*  A copy is found in the textfile GPL.txt and important notices to the license
+*  from the author is found in LICENSE.txt distributed with these scripts.
+*
+*
+*  This script is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*  GNU General Public License for more details.
+*
+*  This copyright notice MUST APPEAR in all copies of the script!
+***************************************************************/
+/**
+ * Lexer for indexed_search
+ *
+ * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
+ * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
+ */
+/**
+ * [CLASS/FUNCTION INDEX of SCRIPT]
+ *
+ *
+ *
+ *   91: class tx_indexedsearch_lexer
+ *  105:     function tx_indexedsearch_lexer()
+ *  117:     function split2Words($wordString)
+ *
+ *              SECTION: Helper functions
+ *  176:     function utf8_ord(&$str, &$len, $pos=0, $hex=false)
+ *  201:     function utf8_is_letter(&$str, &$len, $pos=0, $scan=false)
+ *  284:     function get_word($charset, &$str, $pos=0)
+ *
+ * TOTAL FUNCTIONS: 5
+ * (This index is automatically created/updated by the extension "extdeveval")
+ *
+ */
+
+
+
+/*
+
+DESCRIPTION OF (CJK) ALGORITHM
+
+  Continuous letters and numbers make up words.  Spaces and symbols
+  separate letters and numbers into words.  This is sufficient for
+  all western text.
+
+  CJK doesn't use spaces or separators to separate words, so the only
+  way to really find out what constitutes a word would be to have a
+  dictionary and advanced heuristics.  Instead, we form pairs from
+  consecutive characters, in such a way that searches will find only
+  characters that appear more-or-less the right sequence.  For example:
+
+    ABCDE => AB BC CD DE
+
+  This works okay since both the index and the search query is split
+  in the same manner, and since the set of characters is huge so the
+  extra matches are not significant.
+
+*/
+
+
+
+
+
+
+
+
+/**
+ * Lexer class for indexed_search
+ * A lexer splits the text into words
+ *
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
+ * @package TYPO3
+ * @subpackage tx_indexedsearch
+ */
+class tx_indexedsearch_lexer {
+
+       var $debug = FALSE;             // If set, split2Words() records an HTML trace of the splitting in $debugString
+       var $debugString = '';          // HTML trace of the last split2Words() call (only filled when $debug is set)
+
+       var $csObj;             // Charset class object , t3lib_cs
+
+
+
+       /**
+        * Constructor: Initializes the charset class, t3lib_cs
+        *
+        * @return      void
+        */
+       function tx_indexedsearch_lexer() {
+               $this->csObj = &t3lib_div::makeInstance('t3lib_cs');
+       }
+
+
+       /**
+        * Splitting string into words.
+        * Used for indexing, can also be used to find words in query.
+        * The string is converted to lowercase first; words are then picked off one at a time with get_word().
+        *
+        * @param       string          String with UTF-8 content to process.
+        * @return      array           Array of words in utf-8
+        */
+       function split2Words($wordString)       {
+
+                       // Reset debug string:
+               $this->debugString = '';
+
+                       // Then convert the string to lowercase:
+               $wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
+
+                       // Now, splitting words:
+               $pos = 0;
+               $words = array();
+
+               while(1)        {
+                               // get_word() returns false when no further word exists; stop on that or on an empty word:
+                       $word = $this->get_word('utf-8', $wordString, $pos);
+                       if (!is_array($word) || !$word[1])      break;
+                       list($start,$len) = $word;
+
+                       $words[] = substr($wordString,$start,$len);
+
+                       if ($this->debug)       {
+                                       // Skipped (non-word) characters are wrapped in a red span, the word itself is appended plain:
+                               $this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.htmlspecialchars(substr($wordString,$start,$len));
+                       }
+
+                               // Continue right after the word just found:
+                       $pos = $start+$len;
+               }
+
+               return $words;
+       }
+
+
+
+
+       /************************************
+        *
+        * Helper functions
+        *
+        ************************************/
+
+       /**
+        * Converts a UTF-8 multibyte character to a UNICODE codepoint
+        *
+        * @param       string          UTF-8 multibyte character string (reference)
+        * @param       integer         The length of the character (reference, return value)
+        * @param       integer         Starting position in input string
+        * @param       boolean         If set, then a hex. number is returned
+        * @return      mixed           UNICODE codepoint as integer, or (if $hex is set) as string "x" followed by the hex number
+        */
+       function utf8_ord(&$str, &$len, $pos=0, $hex=false)     {
+               $ord = ord($str[$pos]);
+               $len = 1;
+
+               if ($ord > 0x80)        {
+                       for ($bc=-1, $mbs=$ord; $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of extra bytes
+                       $len += $bc;
+
+                       $ord = $ord & ((1 << (6-$bc)) - 1);     // mask utf-8 lead-in bytes
+                       for ($i=$pos+1; $bc; $bc--, $i++)       // "bring in" data bytes
+                               $ord = ($ord << 6) | (ord($str[$i]) & 0x3F);
+               }
+
+               return $hex ? 'x'.dechex($ord) : $ord;
+       }
+
+       /**
+        * See if a character is a letter (or a string of letters or non-letters).
+        * The loop always reads one character ahead: $found is the letter status of the character read last,
+        * $letter the status of the first character, and $len only counts characters which have been confirmed.
+        *
+        * @param       string          Input string (reference)
+        * @param       integer         Byte-length of character sequence (reference, return value)
+        * @param       integer         Starting position in input string
+        * @param       boolean         If set will scan for a whole sequence of characters
+        * @return      boolean         letter (or word) found
+        */
+       function utf8_is_letter(&$str, &$len, $pos=0, $scan=false)      {
+               $len = 0;
+               $bc = 0;
+               $found = false; // was the character read last a letter?
+               $letter = true; // does the sequence consist of letters? (decided by the first character)
+
+               if (!isset($str[$pos])) return false;   // nothing at the start position
+
+               while(1) {
+                       if ($len)       {
+                               if ($scan)      {
+                                       if ($letter && !$found) {       // end of word reached
+                                               return true;
+                                       }
+                                       elseif (!$letter && $found)     {       // end of non-word reached
+                                               return false;
+                                       }
+                               }
+                               else    {
+                                               // Report single letter status. This must be $letter (status of the first character);
+                                               // $found describes the look-ahead character that follows it.
+                                       return $letter;
+                               }
+                       }
+                       $len += $bc;    // add byte-length of last found character
+                       $found = false;
+
+                       if (!isset($str[$pos])) return $letter; // end of string
+
+                       $cp = $this->utf8_ord($str,$bc,$pos);
+                       $pos += $bc;
+
+                       if ($cp >= 0x41 && $cp <= 0x5A ||       // Basic Latin: capital letters
+                           $cp >= 0x30 && $cp <= 0x39 ||       // Numbers
+                               $cp >= 0x61 && $cp <= 0x7A)     {       //              small letters
+                               $found = true;
+                               continue;
+                       }
+
+                       if ($cp >= 0xC0 && $cp <= 0xFF) {       // Latin-1 Supplement (0x80-0xFF)
+                               // 0x80-0x9F are unassigned
+                               // 0xA0-0xBF are non-letters
+
+                               if ($cp != 0xD7 && $cp != 0xF7) {       // multiplication and division sign
+                                       $found = true;
+                                       continue;
+                               }
+                       } elseif ($cp >= 0x100 && $cp < 0x280)  {       // Latin Extended-A and -B
+                               $found = true;
+                               continue;
+                       } elseif ($cp >= 0x370 && $cp < 0x400)  {       // Greek and Coptic
+                               $found = true;
+                               continue;
+                       } elseif ($cp >= 0x400 && $cp < 0x530)  {       // Cyrillic and Cyrillic Supplement
+                               $found = true;
+                               continue;
+                       } elseif ($cp >= 0x590 && $cp < 0x600)  {       // Hebrew
+                               $found = true;
+                               continue;
+                       } elseif ($cp >= 0x600 && $cp < 0x700)  {       // Arabic
+                               $found = true;
+                               continue;
+                       }
+                               // I don't think we need to support these:
+                               //  Latin Extended Additional
+                               //  Greek Extended
+                               //  Alphabetic Presentation Forms
+                               //  Arabic Presentation Forms-A
+                               //  Arabic Presentation Forms-B
+
+                               // A non-letter as the very first character means the sequence is a non-word:
+                       if (!$len)      $letter = false;
+               }
+
+               return false;
+       }
+
+       /**
+        * Get the first word in a given string (initial non-letters will be skipped)
+        *
+        * @param       string          The charset (only "utf-8" is handled)
+        * @param       string          Input string (reference)
+        * @param       integer         Starting position in input string
+        * @return      array           0: start, 1: len or false if no word has been found
+        */
+       function get_word($charset, &$str, $pos=0)      {
+               if ($charset == 'utf-8')        {
+                       $letters = $this->utf8_is_letter($str, $len, $pos, true);
+                       if ($letters)   return array($pos,$len);        // word found
+
+                               // Otherwise $len is the length of the non-word sequence; skip it:
+                       $pos += $len;
+                       if (!isset($str[$pos])) return false;   // end of string
+
+                               // What follows a non-word sequence is a word; measure it:
+                       $this->utf8_is_letter($str, $len, $pos, true);
+                       return array($pos,$len);
+               }
+
+               return false;
+       }
+}
+
+
+if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])    {
+    include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
+}
+?>
\ No newline at end of file
diff --git a/typo3/sysext/indexed_search/cli/conf.php b/typo3/sysext/indexed_search/cli/conf.php
new file mode 100644 (file)
index 0000000..00f0f23
--- /dev/null
@@ -0,0 +1,8 @@
+<?php
+// Configuration for the CLI indexer: indexer_cli.phpsh requires this file and then uses $BACK_PATH to locate init.php.
+// DO NOT REMOVE OR CHANGE THESE 3 LINES:
+define('TYPO3_MOD_PATH', 'sysext/indexed_search/cli/');
+$BACK_PATH = '../../../';
+$MCONF['name'] = '_CLI_indexedsearch';
+
+?>
diff --git a/typo3/sysext/indexed_search/cli/indexer_cli.phpsh b/typo3/sysext/indexed_search/cli/indexer_cli.phpsh
new file mode 100755 (executable)
index 0000000..f828a03
--- /dev/null
@@ -0,0 +1,25 @@
+#! /usr/bin/php -q
+<?php
+
+// *****************************************
+// Standard initialization of a CLI module:
+// *****************************************
+
+       // Defining circumstances for CLI mode:
+define('TYPO3_cliMode', TRUE);
+
+       // Defining PATH_thisScript here: Must be the ABSOLUTE path of this script in the right context:
+       // This will work as long as the script is called by its absolute path! (The constant name is passed as a quoted string.)
+define('PATH_thisScript',$HTTP_ENV_VARS['_']);
+
+       // Include configuration file (sets TYPO3_MOD_PATH, $BACK_PATH and $MCONF):
+require(dirname(PATH_thisScript).'/conf.php');
+
+       // Include init file:
+require(dirname(PATH_thisScript).'/'.$BACK_PATH.'init.php');
+
+
+
+# HERE you run your application!
+
+?>
index 718344a..845a81c 100755 (executable)
@@ -1,33 +1,55 @@
-- add searching in additional individual tables - non-indexed though (just comfortably "all-in-one" search)
-- add API for indexing a string which is passed by an extension (eg. the bodytext of a news-plugin). 
-  - Attached to this string might be information of which link could display it. For instance, it could be the page id + which parameter string to append.
-  - API for searching in this indexed content ONLY -> in other words an advanced search engine for records.
-- add possibility of cron-job based crawler-indexing of any external site (based on configuration record in the page tree. Access to that page will determin whether external URL is part of result. Just like the external media is.)
-- add possibility of indexing off non-cached page content which is re-indexed based on a time-interval.
-- flags i pages tabel: set_for_indexing (reset when indexed), do not index, ...
+- Improve lexer:
+       - See BASIC_LEXER from Oracle
+       - CJK hack from Zope.
+       - Test
+       - Implement in search query analysis.
+- TESTING with russian, danish, chinese, japanese etc...
+- CVS
+
+**************'
+
+- Index Configurations / CLI indexer
+- Proper skinning? / getLL? / XHTML
+-------------------------------
+
+CRAWLER:
+&L=[_TABLE:sys_language;_PID:0:_tx_indexedsearch_fields:bodytext,header]
+&[_LOGIN]=[,kasper,francis;_PID:]
+&myext[uid]=[1-34,35,36-10]&another=1?
+(cache mgm / crawler / publishing)
+-> parameters "_TABLE" can instruct indexed_Search which table to index records from!
+-> default is flush, INHERIT can
+-> crawler gets result back?
+       - cached?
+       - parsetime?
+       - strlen?
+
+
+
+
+************************************************************
+
+Templating / Display in plugin:
 - Localization, configuration of search-options, stylesheet formatting of result content (with new CSS Stylesheet Editor)
-- Templating with Template Voila TOs?
-- Metaphone algorithm which works with foreign languages (Double-metaphone function? Rene suggested...)
-- More helperapps for RTF, SXW, Excel, ?
-- Faster linkPage() function
+- Templating
+       - with Template Voila TOs?
+       - other approaches? (markers seem straight forward)
+- linkPage() function
+       - Link correctly to MP links / external documents?
+       - Link correctly to external documents / URLs?
 - CHECK: Which keys are necessary??? There are four keys on typo3.com. Are they all used by the indexer or what?
-- Add support for windows versions of PDF/CATdoc (Message-ID: <6B9653C74CC1D41199CF00508BFCA65D90A7E0@nt-server4.tab.lan>). Notice: Catdoc only takes 8.3 filenames.
-- Indexing content from other charsets (probably this is about splitting the local chars, currently only danish/german) Message-ID: <006f01c325ca$b39eb4d0$0100a8c0@FORTRESS>
 - Is result links working for frames? (&type=1) See Message-ID: <3DA762A0.84BDA4F1@kuehn.com>
+- Implement Stop-words in search
 
-- DOC: Tutorial on setting it up, getting it to run, trouble shoot it.
-
-Also see:
-- [Kasper] "EXT:indexed_search" folder in Mailbox 
+Clean up backend modules:
+       - getLL()
+       - skinning()
+       - XHTML()
 
+Misc:
+- DOC: Tutorial on setting it up, getting it to run, trouble shoot it.
+- add possibility of cron-job based crawler-indexing of any external site (based on configuration record in the page tree. Access to that page will determine whether external URL is part of result. Just like the external media is.)
+- flags in pages table: set_for_indexing (reset when indexed), do not index, ...
+- The Tools>Indexing module could need some shining up and more useful features (Someone else does this?)
+- CLI for removal of old indexes: First set flag, then 14 days later remove the records.
 
-__OLD list:
-- OK, Metaphone value > integer, signed
-- OK, strtolower of content does not proces eg. ÆØÅ correctly. What to do?
-- OK splitting i ord tager ikke hensyn til så som email-adresser, danske bogstaver!
-- OK Teste mtime før indexering
-- OK if mtime/contentHash return ok, update tstamp!
-- OK(?) - sections
-- OK- pdf-files subpages.
-- OK - sections - hvad sker der, når sider flyttes rundt.
-- OK - Jumpurl for indexering af links.
\ No newline at end of file
index 86ee857..2e70e73 100755 (executable)
Binary files a/typo3/sysext/indexed_search/doc/manual.sxw and b/typo3/sysext/indexed_search/doc/manual.sxw differ
index 958b4b0..af46c1f 100755 (executable)
@@ -1,9 +1,50 @@
   # cat=basic; type=string; label=Path to PDF parsers: The indexer uses the applications 'pdftotext' and 'pdfinfo' for extracting content from PDF files. You must install these applications in this path. Otherwise leave the field empty.
 pdftools = /usr/local/bin/
 
-  # cat=basic; type=string; label=Path to WORD parsers: The indexer uses the application 'catdoc' for extracting content from WORD files. You must install this application in this path. Otherwise leave the field empty.
+  # cat=basic; type=int; label=PDF parsing mode: Zero=whole file is indexed in one. Positive value: Indicates number of pages at a time, eg. "5" would mean 1-5,6-10,.... Negative integer would indicate (abs value) number of groups. Eg "3" groups of 10 pages would be 1-4,5-8,9-10. Range is -100 to 100.
+pdf_mode = 20
+
+  # cat=basic; type=boolean; label = Native OpenOffice support: Use the extension "libunzipped" to extract Open Office files instead of the external program "ruby" / "ooo_extract.rb"
+nativeOOMethod = 0
+
+  # cat=basic; type=string; label=Path to OOo parser: The indexer uses the application 'ooo_extract' for extracting content from OpenOffice files (unless "[nativeOOMethod]" above is set). You must install ooo_extract in this path. Otherwise leave the field empty.
+OOoExtract = /usr/local/bin/
+
+  # cat=basic; type=string; label=Path to RUBY: The indexer uses the application 'ruby' for extracting content from OpenOffice files (unless "[nativeOOMethod]" above is set). You must install this application in this path. Otherwise leave the field empty.
+ruby = /usr/local/bin/
+
+  # cat=basic; type=string; label=Path to WORD parser: The indexer uses the application 'catdoc' for extracting content from WORD files. You must install this application in this path. Otherwise leave the field empty.
 catdoc = /usr/local/bin/
 
-  # cat=basic; type=int; label=PDF parsing mode: Zero=whole file is indexed in one. Positive value: Indicates number of pages at a time, eg. "5" would means 1-5,6-10,.... Negative integer would indicate (abs value) number of groups. Eg "3" groups of 10 pages would be 1-4,5-8,9-10. Range is -100 to 100.
-pdf_mode = -20
+  # cat=basic; type=string; label=Path to EXCEL parser: The indexer uses the application 'xlhtml' for extracting content from EXCEL files. You must install this application in this path. Otherwise leave the field empty.
+xlhtml = /usr/local/bin/
+
+  # cat=basic; type=string; label=Path to POWERPOINT parser: The indexer uses the application 'ppthtml' for extracting content from POWERPOINT files. You must install this application in this path. Otherwise leave the field empty.
+ppthtml = /usr/local/bin/
+
+  # cat=basic; type=string; label=Path to RTF parser: The indexer uses the application 'unrtf' for extracting content from RTF files. You must install this application in this path. Otherwise leave the field empty.
+unrtf = /usr/local/bin/
+
+   # cat=basic; type=boolean; label=Debug mode: If set, debugging information is collected during indexing and can be shown in the backend. Only for debugging since it slows down the system and fills the database with crap.
+debugMode = 0
+
+  # cat=basic; type=boolean; label=Disable Indexing in Frontend: By default pages are indexed during viewing of pages in the frontend. You can disable this feature so indexing of pages is only initiated through the backend page crawler.
+disableFrontendIndexing = 0
+
+  # cat=basic; type=int; label=Min TTL (hours) for indexed page: The time in hours that must pass before an indexed page can be indexed again regardless of changes on the page.
+minAge = 24
+
+  # cat=basic; type=int; label=Max TTL (hours) for indexed page: The time in hours that is the maximum age of an indexed page before it will get indexed again.
+maxAge = 168
+
+  # cat=basic; type=int; label=Max external files to index: When external files are found on a page this number indicates how many may be indexed at that point. This prevents pages with links to many external documents from taking the server down - but it may also prevent documents from being indexed.
+maxExternalFiles = 5
+
+  # cat=basic; type=int; label=Bitmask for Flags (Advanced): By this value (0-255) you can filter the importance of <title> (128), <keywords> (64) and <description> (32) content from HTML documents. By default none of these will have any importance over the other. Setting the value to eg. 196 means that title-tag content and meta-keywords will be flagged (and rate higher in search results)
+flagBitMask = 196
+
+  # cat=basic; type=string; label=Ignore Extensions: List of file extensions that the external parser will ignore (despite having support for them). Comma list.
+ignoreExtensions =
 
+  # cat=basic; type=boolean; label=Index External HTML URLs: If set, links to external URLs will be indexed if they are of type "text/html".
+indexExternalURLs = 0
index 109180d..5e2b2a6 100755 (executable)
@@ -2,9 +2,9 @@
 
 ########################################################################
 # Extension Manager/Repository config file for ext: "indexed_search"
-# 
+#
 # Auto generated 17-11-2004 17:48
-# 
+#
 # Manual updates:
 # Only the data in the array - anything else is removed by next write
 ########################################################################
@@ -20,7 +20,7 @@ $EM_CONF[$_EXTKEY] = Array (
        'loadOrder' => '',
        'TYPO3_version' => '3.7.0-',
        'PHP_version' => '0.0.1-0.0.1',
-       'module' => 'mod',
+       'module' => 'mod,cli',
        'state' => 'stable',
        'internal' => 1,
        'uploadfolder' => 0,
index 098ebe9..d771578 100755 (executable)
@@ -9,4 +9,27 @@ t3lib_extMgm::addTypoScript($_EXTKEY,'editorcfg','
 
 
 $TYPO3_CONF_VARS['SC_OPTIONS']['tslib/class.tslib_fe.php']['pageIndexing'][] = 'EXT:indexed_search/class.indexer.php:tx_indexedsearch_indexer';
+
+
+       // Configure default document parsers:
+$TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] = array(
+       'pdf' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'doc' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'pps' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'ppt' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'xls' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'sxc' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'sxi' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'sxw' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'rtf' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'txt' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'html' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'htm' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'csv' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'xml' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'jpg' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'jpeg' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+       'tif' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
+);
+
 ?>
\ No newline at end of file
index 8d06e84..4599750 100755 (executable)
@@ -6,11 +6,34 @@ if (TYPO3_MODE=='BE') t3lib_extMgm::addModule('tools','isearch','after:log',t3li
 
 if (TYPO3_MODE=='BE')    {
     t3lib_extMgm::insertModuleFunction(
-        'web_info',        
+        'web_info',
         'tx_indexedsearch_modfunc1',
         t3lib_extMgm::extPath($_EXTKEY).'modfunc1/class.tx_indexedsearch_modfunc1.php',
         'LLL:EXT:indexed_search/locallang.php:mod_indexed_search'
     );
 }
 
+t3lib_extMgm::allowTableOnStandardPages('index_config');
+
+$TCA['index_config'] = Array (
+    'ctrl' => Array (
+        'title' => 'LLL:EXT:indexed_search/locallang_db.php:index_config',
+        'label' => 'title',
+        'tstamp' => 'tstamp',
+        'crdate' => 'crdate',
+        'cruser_id' => 'cruser_id',
+        'type' => 'type',
+        'default_sortby' => 'ORDER BY crdate',
+        'enablecolumns' => Array (
+            'disabled' => 'hidden',
+            'starttime' => 'starttime',
+        ),
+        'dynamicConfigFile' => t3lib_extMgm::extPath($_EXTKEY).'tca.php',
+        'iconfile' => 'default.gif',
+    ),
+    'feInterface' => Array (
+        'fe_admin_fieldList' => 'hidden, starttime, title, description, type, depth, table2index, alternative_source_pid, get_params, chashcalc, filepath, extensions',
+    )
+);
+
 ?>
\ No newline at end of file
index 00adacf..bcf4e75 100755 (executable)
@@ -4,30 +4,6 @@
 #--------------------------------------------------------
 
 
-#
-# Table structure for table 'index_fulltext'
-#
-CREATE TABLE index_fulltext (
-  phash int(11) DEFAULT '0' NOT NULL,
-  fulltextdata mediumtext NOT NULL,
-  PRIMARY KEY (phash)
-);
-
-
-#
-# Table structure for table 'index_grlist'
-#
-CREATE TABLE index_grlist (
-  phash int(11) DEFAULT '0' NOT NULL,
-  phash_x int(11) DEFAULT '0' NOT NULL,
-  hash_gr_list int(11) DEFAULT '0' NOT NULL,
-  gr_list tinytext NOT NULL,
-  uniqid int(11) DEFAULT '0' NOT NULL auto_increment,
-  PRIMARY KEY (uniqid),
-  KEY joinkey (phash,hash_gr_list),
-  KEY phash_grouping (phash_x,hash_gr_list)
-);
-
 
 #
 # Table structure for table 'index_phash'
@@ -42,7 +18,7 @@ CREATE TABLE index_phash (
   data_page_type tinyint(3) unsigned DEFAULT '0' NOT NULL,
   data_page_mp tinytext NOT NULL,
   gr_list tinytext NOT NULL,
-  item_type tinyint(4) DEFAULT '0' NOT NULL,
+  item_type varchar(5) DEFAULT '' NOT NULL,
   item_title tinytext NOT NULL,
   item_description tinytext NOT NULL,
   item_mtime int(11) DEFAULT '0' NOT NULL,
@@ -53,10 +29,21 @@ CREATE TABLE index_phash (
   parsetime int(11) DEFAULT '0' NOT NULL,
   sys_language_uid int(11) DEFAULT '0' NOT NULL,
   item_crdate int(11) DEFAULT '0' NOT NULL,
+  externalUrl tinyint(3) DEFAULT '0' NOT NULL,
+  recordUid int(11) DEFAULT '0' NOT NULL,
+  freeIndexUid int(11) DEFAULT '0' NOT NULL,
   PRIMARY KEY (phash),
   KEY phash_grouping (phash_grouping)
 );
 
+#
+# Table structure for table 'index_fulltext'
+#
+CREATE TABLE index_fulltext (
+  phash int(11) DEFAULT '0' NOT NULL,
+  fulltextdata mediumtext NOT NULL,
+  PRIMARY KEY (phash)
+);
 
 #
 # Table structure for table 'index_rel'
@@ -72,6 +59,18 @@ CREATE TABLE index_rel (
   KEY wid (wid,phash)
 );
 
+#
+# Table structure for table 'index_words'
+#
+CREATE TABLE index_words (
+  wid int(11) DEFAULT '0' NOT NULL,
+  baseword varchar(60) DEFAULT '' NOT NULL,
+  metaphone int(11) DEFAULT '0' NOT NULL,
+  is_stopword tinyint(3) DEFAULT '0' NOT NULL,
+  PRIMARY KEY (wid),
+  KEY baseword (baseword,wid),
+  KEY metaphone (metaphone,wid)
+);
 
 #
 # Table structure for table 'index_section'
@@ -91,6 +90,19 @@ CREATE TABLE index_section (
   KEY rl0_2 (rl0,phash)
 );
 
+#
+# Table structure for table 'index_grlist'
+#
+CREATE TABLE index_grlist (
+  phash int(11) DEFAULT '0' NOT NULL,
+  phash_x int(11) DEFAULT '0' NOT NULL,
+  hash_gr_list int(11) DEFAULT '0' NOT NULL,
+  gr_list tinytext NOT NULL,
+  uniqid int(11) DEFAULT '0' NOT NULL auto_increment,
+  PRIMARY KEY (uniqid),
+  KEY joinkey (phash,hash_gr_list),
+  KEY phash_grouping (phash_x,hash_gr_list)
+);
 
 #
 # Table structure for table 'index_stat_search'
@@ -120,15 +132,39 @@ CREATE TABLE index_stat_word (
   KEY tstamp (tstamp,word)
 );
 
-
 #
-# Table structure for table 'index_words'
+# Table structure for table 'index_debug'
 #
-CREATE TABLE index_words (
-  wid int(11) DEFAULT '0' NOT NULL,
-  baseword varchar(30) DEFAULT '' NOT NULL,
-  metaphone int(11) DEFAULT '0' NOT NULL,
-  PRIMARY KEY (wid),
-  KEY baseword (baseword,wid),
-  KEY metaphone (metaphone,wid)
-);
\ No newline at end of file
+CREATE TABLE index_debug (
+  phash int(11) DEFAULT '0' NOT NULL,
+  debuginfo mediumtext NOT NULL,
+  PRIMARY KEY (phash)
+);
+
+#
+# Table structure for table 'index_config'
+#
+CREATE TABLE index_config (
+    uid int(11) DEFAULT '0' NOT NULL auto_increment,
+    pid int(11) DEFAULT '0' NOT NULL,
+    tstamp int(11) unsigned DEFAULT '0' NOT NULL,
+    crdate int(11) unsigned DEFAULT '0' NOT NULL,
+    cruser_id int(11) unsigned DEFAULT '0' NOT NULL,
+    hidden tinyint(4) unsigned DEFAULT '0' NOT NULL,
+    starttime int(11) unsigned DEFAULT '0' NOT NULL,
+    title tinytext NOT NULL,
+    description text NOT NULL,
+    type int(11) unsigned DEFAULT '0' NOT NULL,
+    depth int(11) unsigned DEFAULT '0' NOT NULL,
+    table2index tinytext NOT NULL,
+    alternative_source_pid blob NOT NULL,
+    get_params tinytext NOT NULL,
+    fieldlist tinytext NOT NULL,
+       externalUrl tinytext NOT NULL,
+    chashcalc tinyint(3) unsigned DEFAULT '0' NOT NULL,
+    filepath tinytext NOT NULL,
+    extensions tinytext NOT NULL,
+
+    PRIMARY KEY (uid),
+    KEY parent (pid)
+);
diff --git a/typo3/sysext/indexed_search/locallang_db.xml b/typo3/sysext/indexed_search/locallang_db.xml
new file mode 100755 (executable)
index 0000000..ce089fa
--- /dev/null
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="utf-8" standalone="yes" ?>
+<T3locallang>
+    <meta type="array">
+        <description>Table labels for indexed_search</description>
+        <type>database</type>
+        <csh_table></csh_table>
+        <fileId>EXT:indexed_search/locallang_db.xml</fileId>
+        <labelContext type="array">
+            <label index="index_config"></label>
+            <label index="index_config.title"></label>
+            <label index="index_config.description"></label>
+            <label index="index_config.type.I.1"></label>
+            <label index="index_config.type.I.2"></label>
+            <label index="index_config.type.I.3"></label>
+            <label index="index_config.type"></label>
+            <label index="index_config.depth.I.0"></label>
+            <label index="index_config.depth.I.1"></label>
+            <label index="index_config.depth.I.2"></label>
+            <label index="index_config.depth.I.3"></label>
+            <label index="index_config.depth"></label>
+            <label index="index_config.table2index"></label>
+            <label index="index_config.alternative_source_pid"></label>
+            <label index="index_config.get_params"></label>
+            <label index="index_config.chashcalc"></label>
+            <label index="index_config.filepath"></label>
+            <label index="index_config.extensions"></label>
+        </labelContext>
+    </meta>
+    <data type="array">
+        <languageKey index="default" type="array">
+            <label index="index_config">Indexing Configuration</label>
+            <label index="index_config.title">Title:</label>
+            <label index="index_config.description">Description:</label>
+            <label index="index_config.type.I.1">Database Records</label>
+            <label index="index_config.type.I.2">Filepath on server</label>
+            <label index="index_config.type.I.3">External URL</label>
+            <label index="index_config.type">Type:</label>
+            <label index="index_config.depth.I.0">Single page</label>
+            <label index="index_config.depth.I.1">1 Level</label>
+            <label index="index_config.depth.I.2">2 Levels</label>
+            <label index="index_config.depth.I.3">3 Levels</label>
+            <label index="index_config.depth">Depth:</label>
+            <label index="index_config.table2index">Table to index:</label>
+            <label index="index_config.alternative_source_pid">Alternative Source Page:</label>
+            <label index="index_config.get_params">GET parameter string (with ###UID### substitution):</label>
+            <label index="index_config.fields">Fields (first is title):</label>
+            <label index="index_config.externalUrl">External URL:</label>
+            <label index="index_config.chashcalc">Calculate cHash (force caching)?</label>
+            <label index="index_config.filepath">Filepath:</label>
+            <label index="index_config.extensions">Limit to extensions (commalist):</label>
+        </languageKey>
+    </data>
+    <orig_hash type="array">
+        <languageKey index="default" type="array">
+            <label index="index_config" type="integer">260751312</label>
+            <label index="index_config.title" type="integer">85903807</label>
+            <label index="index_config.description" type="integer">218120871</label>
+            <label index="index_config.type.I.1" type="integer">202530923</label>
+            <label index="index_config.type.I.2" type="integer">43997271</label>
+            <label index="index_config.type.I.3" type="integer">76246465</label>
+            <label index="index_config.type" type="integer">241539922</label>
+            <label index="index_config.depth.I.0" type="integer">141943516</label>
+            <label index="index_config.depth.I.1" type="integer">159741036</label>
+            <label index="index_config.depth.I.2" type="integer">61164083</label>
+            <label index="index_config.depth.I.3" type="integer">113480435</label>
+            <label index="index_config.depth" type="integer">186534530</label>
+            <label index="index_config.table2index" type="integer">17017163</label>
+            <label index="index_config.alternative_source_pid" type="integer">198125496</label>
+            <label index="index_config.get_params" type="integer">218226603</label>
+            <label index="index_config.chashcalc" type="integer">230076146</label>
+            <label index="index_config.filepath" type="integer">219758268</label>
+            <label index="index_config.extensions" type="integer">228527306</label>
+        </languageKey>
+    </orig_hash>
+    <orig_text type="array">
+        <languageKey index="default" type="array">
+        </languageKey>
+    </orig_text>
+</T3locallang>
\ No newline at end of file
index 4b24565..f05b572 100755 (executable)
@@ -38,7 +38,7 @@ require ($BACK_PATH."template.php");
 $BE_USER->modAccess($MCONF,1);
 
 t3lib_extMgm::isLoaded("indexed_search",1);
-
+require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
 
 
 // ***************************
@@ -77,6 +77,11 @@ class SC_mod_tools_isearch_index {
                                "defCol" => Array('<TD><img src="'.$this->doc->backPath.'clear.gif" width=10 height=1></td><td valign="top" nowrap>','</td>')
                        )
                );
+
+               $indexer = t3lib_div::makeInstance('tx_indexedsearch_indexer');
+               $indexer->initializeExternalParsers();
+               debug(array_keys($indexer->external_parsers));
+               debug($indexer->internal_log);
        }
        function menuConfig()   {
                global $BE_USER,$LANG,$BACK_PATH,$TCA_DESCR,$TCA,$CLIENT,$TYPO3_CONF_VARS;
index 96e8f2a..12eeead 100755 (executable)
@@ -1,19 +1,19 @@
 <?php
 /***************************************************************
 *  Copyright notice
-*  
+*
 *  (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
 *  All rights reserved
 *
-*  This script is part of the TYPO3 project. The TYPO3 project is 
+*  This script is part of the TYPO3 project. The TYPO3 project is
 *  free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
-* 
+*
 *  The GNU General Public License can be found at
 *  http://www.gnu.org/copyleft/gpl.html.
-* 
+*
 *  This script is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *
 *  This copyright notice MUST APPEAR in all copies of the script!
 ***************************************************************/
-/** 
+/**
  * Module extension (addition to function menu) 'Indexed search' for the 'indexed_search' extension.
  *
  * @author    Kasper Skårhøj <kasperYYYY@typo3.com>
  */
+/**
+ * [CLASS/FUNCTION INDEX of SCRIPT]
+ *
+ *
+ *
+ *  110: class tx_indexedsearch_modfunc1 extends t3lib_extobjbase
+ *  124:     function modMenu()
+ *  148:     function main()
+ *
+ *              SECTION: Drawing table of indexed pages
+ *  261:     function drawTableOfIndexedPages()
+ *  312:     function indexed_info($data, $firstColContent)
+ *  398:     function printPhashRow($row,$grouping=0,$extraGrListRows)
+ *  539:     function printPhashRowHeader()
+ *  592:     function returnNumberOfColumns()
+ *
+ *              SECTION: Details display, phash row
+ *  628:     function showDetailsForPhash($phash)
+ *  745:     function listWords($ftrows,$header, $stopWordBoxes=FALSE, $page='')
+ *  794:     function listMetaphoneStat($ftrows,$header)
+ *  831:     function linkWordDetails($string,$wid)
+ *  843:     function linkMetaPhoneDetails($string,$metaphone)
+ *  853:     function flagsMsg($flags)
+ *
+ *              SECTION: Details display, words / metaphone
+ *  884:     function showDetailsForWord($wid)
+ *  943:     function showDetailsForMetaphone($metaphone)
+ *
+ *              SECTION: Helper functions
+ * 1014:     function printRemoveIndexed($phash,$alt)
+ * 1027:     function printReindex($resultRow,$alt)
+ * 1042:     function linkDetails($string,$phash)
+ * 1051:     function linkList()
+ * 1062:     function showPageDetails($string,$id)
+ * 1072:     function printExtraGrListRows($extraGrListRows)
+ * 1089:     function printRootlineInfo($row)
+ * 1123:     function makeItemTypeIcon($it,$alt='')
+ * 1148:     function utf8_to_currentCharset($string)
+ *
+ *              SECTION: Reindexing
+ * 1180:     function reindexPhash($phash, $pageId)
+ * 1234:     function getUidRootLineForClosestTemplate($id)
+ *
+ *              SECTION: Indexing of configurations
+ * 1275:     function extraIndexing()
+ * 1384:     function indexExtUrlRecursively($url, $depth, $pageId, $rl, $cfgUid)
+ *
+ *              SECTION: SQL functions
+ * 1441:     function removeIndexedPhashRow($phashList,$clearPageCache=1)
+ * 1478:     function getGrListEntriesForPhash($phash,$gr_list)
+ * 1498:     function processStopWords($stopWords)
+ * 1518:     function processPageKeywords($pageKeywords, $pageUid)
+ *
+ * TOTAL FUNCTIONS: 32
+ * (This index is automatically created/updated by the extension "extdeveval")
+ *
+ */
+
+
+require_once(PATH_t3lib.'class.t3lib_pagetree.php');
+require_once(PATH_t3lib.'class.t3lib_extobjbase.php');
+require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
+
 
+       // ... all for the rootline!
+require_once (PATH_t3lib."class.t3lib_page.php");
+require_once (PATH_t3lib."class.t3lib_tstemplate.php");
+require_once (PATH_t3lib."class.t3lib_tsparser_ext.php");
 
+       // Keywords mgm:
+require_once (PATH_t3lib."class.t3lib_tcemain.php");
 
-require_once(PATH_t3lib."class.t3lib_pagetree.php");
-require_once(PATH_t3lib."class.t3lib_extobjbase.php");
-require_once(t3lib_extMgm::extPath("indexed_search")."class.indexer.php");
 
+
+/**
+ * Module function class of the 'indexed_search' extension (registered for the 'web_info' function menu)
+ *
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
+ * @package TYPO3
+ * @subpackage tx_indexedsearch
+ */
 class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
-       var $allPhashListed=array();
-       
+
+               // Internal, dynamic:
+       var $allPhashListed = array();          // phash values accumulations for link to clear all
+       var $external_parsers = array();        // External content parsers - objects set here with file extensions as keys.
+       var $iconFileNameCache = array();       // File extensions - icon map/cache.
+       var $indexerObj;                                        // Indexer object
+
+
+       /**
+        * Initialize menu array internally
+        *
+        * @return      void
+        */
     function modMenu()    {
         global $LANG;
-        
+
                return array (
-                       "depth" => array(
-                               0 => $LANG->sL("LLL:EXT:lang/locallang_core.php:labels.depth_0"),
-                               1 => $LANG->sL("LLL:EXT:lang/locallang_core.php:labels.depth_1"),
-                               2 => $LANG->sL("LLL:EXT:lang/locallang_core.php:labels.depth_2"),
-                               3 => $LANG->sL("LLL:EXT:lang/locallang_core.php:labels.depth_3"),
+                       'depth' => array(
+                               0 => $LANG->sL('LLL:EXT:lang/locallang_core.php:labels.depth_0'),
+                               1 => $LANG->sL('LLL:EXT:lang/locallang_core.php:labels.depth_1'),
+                               2 => $LANG->sL('LLL:EXT:lang/locallang_core.php:labels.depth_2'),
+                               3 => $LANG->sL('LLL:EXT:lang/locallang_core.php:labels.depth_3'),
+                       ),
+                       'type' => array(
+                               0 => 'Overview',
+                               1 => 'Technical Details',
+                               2 => 'Words and content',
+                               3 => 'Indexing'
                        )
                );
     }
 
+       /**
+        * Produces main content of the module
+        *
+        * @return      string          HTML output
+        */
     function main()    {
             // Initializes the module. Done in this function because we may need to re-initialize if data is submitted!
         global $SOBE,$BE_USER,$LANG,$BACK_PATH,$TCA_DESCR,$TCA,$CLIENT,$TYPO3_CONF_VARS;
-        
-               if ($this->pObj->id<=0) return;
 
-               if (t3lib_div::_GP("deletePhash"))      {
-                       $indexer = t3lib_div::makeInstance("tx_indexedsearch_indexer");
-                       $indexer->removeIndexedPhashRow(t3lib_div::_GP("deletePhash"));
+                       // Return if no page id:
+               if ($this->pObj->id<=0)         return;
+
+                       // Initialize max-list items
+               $this->maxListPerPage = t3lib_div::_GP('listALL') ? 100000 : 100;
+
+                       // Processing deletion of phash rows:
+               if (t3lib_div::_GP('deletePhash'))      {
+                       $this->removeIndexedPhashRow(t3lib_div::_GP('deletePhash'));
+               }
+
+                       // Processing stop-words:
+               if (t3lib_div::_POST('_stopwords'))     {
+                       $this->processStopWords(t3lib_div::_POST('stopWord'));
+               }
+
+                       // Processing page keywords:
+               if (t3lib_div::_POST('_pageKeywords'))  {
+                       $this->processPageKeywords(t3lib_div::_POST('pageKeyword'), t3lib_div::_POST('pageKeyword_pageUid'));
                }
 
+                       // Initialize external document parsers:
+                       // Example configuration, see ext_localconf.php of this extension!
+               if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']))        {
+                       foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef)    {
+                    &n