[TASK] Unify backend: header/section
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
index a3e04dc..12746a8 100755 (executable)
@@ -2,7 +2,7 @@
 /***************************************************************
 *  Copyright notice
 *
-*  (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
+*  (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
 *  All rights reserved
 *
 *  This script is part of the TYPO3 project. The TYPO3 project is
 /**
  * Lexer for indexed_search
  *
- * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
+ * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
  * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
  */
-/**
- * [CLASS/FUNCTION INDEX of SCRIPT]
- *
- *
- *
- *   91: class tx_indexedsearch_lexer
- *  105:     function tx_indexedsearch_lexer()
- *  117:     function split2Words($wordString)
- *
- *              SECTION: Helper functions
- *  176:     function utf8_ord(&$str, &$len, $pos=0, $hex=false)
- *  201:     function utf8_is_letter(&$str, &$len, $pos=0, $scan=false)
- *  284:     function get_word($charset, &$str, $pos=0)
- *
- * TOTAL FUNCTIONS: 5
- * (This index is automatically created/updated by the extension "extdeveval")
- *
- */
-
-
-
-/*
-
-DESCRIPTION OF (CJK) ALGORITHM
 
-  Continuous letters and numbers make up words.  Spaces and symbols
-  separate letters and numbers into words.  This is sufficient for
-  all western text.
 
-  CJK doesn't use spaces or separators to separate words, so the only
-  way to really find out what constitutes a word would be to have a
-  dictionary and advanced heuristics.  Instead, we form pairs from
-  consecutive characters, in such a way that searches will find only
-  characters that appear more-or-less the right sequence.  For example:
 
-    ABCDE => AB BC CD DE
 
-  This works okay since both the index and the search query is split
-  in the same manner, and since the set of characters is huge so the
-  extra matches are not significant.
-
-*/
 
 
 
@@ -84,17 +46,40 @@ DESCRIPTION OF (CJK) ALGORITHM
  * Lexer class for indexed_search
  * A lexer splits the text into words
  *
- * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
+ * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
  * @package TYPO3
  * @subpackage tx_indexedsearch
  */
 class tx_indexedsearch_lexer {
 
-       var $debug = FALSE;
+               // Debugging options:
+       var $debug = FALSE;             // If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display)
        var $debugString = '';
 
-       var $csObj;             // Charset class object , t3lib_cs
-
+       /**
+        * Charset class object
+        *
+        * @var t3lib_cs
+        */
+       var $csObj;
+
+
+               // Configuration of the lexer:
+       var $lexerConf = array(
+               'printjoins' => array(  // This is the Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
+                       0x2e,   // "."
+                       0x2d,   // "-"
+                       0x5f,   // "_"
+                       0x3a,   // ":"
+                       0x2f,   // "/"
+                       0x27,   // "'"
+                       // 0x615,       // ARABIC SMALL HIGH TAH
+               ),
+               'casesensitive' => FALSE,       // Set, if case sensitive indexing is wanted.
+               'removeChars' => array(         // List of unicode numbers of chars that will be removed before words are returned (eg. "-")
+                       0x2d    // "-"
+               )
+       );
 
 
        /**
@@ -102,10 +87,22 @@ class tx_indexedsearch_lexer {
         *
         * @return      void
         */
-       function tx_indexedsearch_lexer() {
-               $this->csObj = &t3lib_div::makeInstance('t3lib_cs');
+       function __construct() {
+               $this->csObj = t3lib_div::makeInstance('t3lib_cs');
        }
 
+       /**
+        * Compatibility constructor.
+        *
+        * @deprecated since TYPO3 4.6 and will be removed in TYPO3 4.8. Use __construct() instead.
+        */
+       public function tx_indexedsearch_lexer() {
+               t3lib_div::logDeprecatedFunction();
+                       // Note: we cannot call $this->__construct() here because it would call the derived class constructor and cause recursion
+                       // This code uses official PHP behavior (http://www.php.net/manual/en/language.oop5.basic.php) when $this in the
+                       // statically called non-static method inherits $this from the caller's scope.
+               tx_indexedsearch_lexer::__construct();
+       }
 
        /**
         * Splitting string into words.
@@ -120,7 +117,9 @@ class tx_indexedsearch_lexer {
                $this->debugString = '';
 
                        // Then convert the string to lowercase:
-               $wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
+               if (!$this->lexerConf['casesensitive']) {
+                       $wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
+               }
 
                        // Now, splitting words:
                $len = 0;
@@ -130,18 +129,19 @@ class tx_indexedsearch_lexer {
                $this->debugString = '';
 
                while(1)        {
-                       list($start,$len) = $this->get_word('utf-8', $wordString, $pos);
+                       list($start,$len) = $this->get_word($wordString, $pos);
                        if ($len)       {
-                               $words[] = substr($wordString,$start,$len);
+
+                               $this->addWords($words, $wordString,$start,$len);
 
                                if ($this->debug)       {
-                                       $this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.htmlspecialchars(substr($wordString,$start,$len));
+                                       $this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.
+                                                                               htmlspecialchars(substr($wordString,$start,$len));
                                }
 
                                $pos = $start+$len;
                        } else break;
                }
-
                return $words;
        }
 
@@ -156,37 +156,99 @@ class tx_indexedsearch_lexer {
 
 
 
-
-
-       /************************************
+       /**********************************
         *
         * Helper functions
         *
-        ************************************/
+        ********************************/
+
 
        /**
-        * Converts a UTF-8 multibyte character to a UNICODE codepoint
+        * Add word to word-array
+        * This function should be used to make sure CJK sequences are split up in the right way
         *
-        * @param       string          UTF-8 multibyte character string (reference)
-        * @param       integer         The length of the character (reference, return value)
+        * @param       array           Array of accumulated words
+        * @param       string          Complete Input string from where to extract word
+        * @param       integer         Start position of word in input string
+        * @param       integer         The Length of the word string from start position
+        * @return      void
+        */
+       function addWords(&$words, &$wordString, $start, $len)  {
+
+                       // Get word out of string:
+               $theWord = substr($wordString,$start,$len);
+
+                       // Get next chars unicode number and find type:
+               $bc = 0;
+               $cp = $this->utf8_ord($theWord, $bc);
+               list($cType) = $this->charType($cp);
+
+                       // If string is a CJK sequence we follow this algorithm:
+                       /*
+                               DESCRIPTION OF (CJK) ALGORITHM
+
+                               Continuous letters and numbers make up words. Spaces and symbols
+                               separate letters and numbers into words. This is sufficient for
+                               all western text.
+
+                               CJK doesn't use spaces or separators to separate words, so the only
+                               way to really find out what constitutes a word would be to have a
+                               dictionary and advanced heuristics. Instead, we form pairs from
+                               consecutive characters, in such a way that searches will find only
+                               characters that appear more-or-less the right sequence. For example:
+
+                                       ABCDE => AB BC CD DE
+
+                               This works okay since both the index and the search query is split
+                               in the same manner, and since the set of characters is huge so the
+                               extra matches are not significant.
+
+                               (Hint taken from ZOPEs chinese user group)
+
+                               [Kasper: As far as I can see this will only work well with or-searches!]
+                       */
+               if ($cType == 'cjk')    {
+                               // Find total string length:
+                       $strlen = $this->csObj->utf8_strlen($theWord);
+
+                               // Traverse string length and add words as pairs of two chars:
+                       for ($a=0; $a<$strlen; $a++)    {
+                               if ($strlen==1 || $a<$strlen-1) {
+                                       $words[] = $this->csObj->utf8_substr($theWord, $a, 2);
+                               }
+                       }
+               } else {        // Normal "single-byte" chars:
+                               // Remove chars:
+                       foreach($this->lexerConf['removeChars'] as $skipJoin)   {
+                               $theWord = str_replace($this->csObj->UnumberToChar($skipJoin),'',$theWord);
+                       }
+                               // Add word:
+                       $words[] = $theWord;
+               }
+       }
+
+       /**
+        * Get the first word in a given utf-8 string (initial non-letters will be skipped)
+        *
+        * @param       string          Input string (reference)
         * @param       integer         Starting position in input string
-        * @param       boolean         If set, then a hex. number is returned
-        * @return      integer         UNICODE codepoint
+        * @return      array           0: start, 1: len or FALSE if no word has been found
         */
-       function utf8_ord(&$str, &$len, $pos=0, $hex=false)     {
-               $ord = ord($str{$pos});
-               $len = 1;
+       function get_word(&$str, $pos=0)        {
 
-               if ($ord > 0x80)        {
-                       for ($bc=-1, $mbs=$ord; $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of extra bytes
-                       $len += $bc;
+               $len=0;
 
-                       $ord = $ord & ((1 << (6-$bc)) - 1);     // mask utf-8 lead-in bytes
-                       for ($i=$pos+1; $bc; $bc--, $i++)       // "bring in" data bytes
-                               $ord = ($ord << 6) | (ord($str{$i}) & 0x3F);
+                       // If return is TRUE, a word was found starting at this position, so returning position and length:
+               if ($this->utf8_is_letter($str, $len, $pos))    {
+                       return array($pos,$len);
                }
 
-               return $hex ? 'x'.dechex($ord) : $ord;
+                       // If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
+               $pos += $len;
+               if ($str{$pos} == '')   return FALSE;   // check end of string before looking for word of course.
+
+               $this->utf8_is_letter($str, $len, $pos);
+               return array($pos,$len);
        }
 
        /**
@@ -195,111 +257,149 @@ class tx_indexedsearch_lexer {
         * @param       string          Input string (reference)
         * @param       integer         Byte-length of character sequence (reference, return value)
         * @param       integer         Starting position in input string
-        * @param       boolean         If set will scan for a whole sequence of characters
         * @return      boolean         letter (or word) found
         */
-       function utf8_is_letter(&$str, &$len, $pos=0, $scan=false)      {
+       function utf8_is_letter(&$str, &$len, $pos=0)   {
                global $cs;
 
                $len = 0;
                $bc = 0;
-               $found = false; // found a letter
-               $letter = true; // looking for a letter?
+               $cType = $cType_prev = FALSE; // Letter type
+               $letter = TRUE; // looking for a letter?
 
-               if ($str{$pos} == '')   return false;
+               if ($str{$pos} == '')   return FALSE;   // Return FALSE on end-of-string at this stage
 
                while(1) {
+
+                               // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
                        if ($len)       {
-                               if ($scan)      {
-                                       if ($letter && !$found) {       // end of word reached
-                                               return true;
-                                       }
-                                       elseif (!$letter && $found)     {       // end of non-word reached
-                                               return false;
+                               if ($letter)    {       // We are in a sequence of words
+                                       if (!$cType     // The char was NOT a letter
+                                                       || ($cType_prev=='cjk' && t3lib_div::inList('num,alpha',$cType)) || ($cType=='cjk' && t3lib_div::inList('num,alpha',$cType_prev))       // ... or the previous and current char are from single-byte sets vs. asian CJK sets
+                                                       )       {
+                                                       // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
+                                               if (!in_array($cp,$this->lexerConf['printjoins']))      {
+                                                               // If a printjoin start length has been record, set that back now so the length is right (filtering out multiple end chars)
+                                                       if ($printJoinLgd)      {
+                                                               $len = $printJoinLgd;
+                                                       }
+                                                       #debug($cp);
+                                                       return TRUE;
+                                               } else {        // If a printJoin char is found, record the length if it has not been recorded already:
+                                                       if (!$printJoinLgd)     $printJoinLgd = $len;
+                                               }
+                                       } else {        // When a true letter is found, reset printJoinLgd counter:
+                                               $printJoinLgd = 0;
                                        }
                                }
-                               else    {
-                                       return $found;  // report single letter status
+                               elseif (!$letter && $cType)     {       // end of non-word reached
+                                       return FALSE;
                                }
                        }
                        $len += $bc;    // add byte-length of last found character
-                       $found = false;
 
-                       if ($str{$pos} == '')   return $letter; // end of string
+                       if ($str{$pos} == '')   return $letter; // end of string; return status of string till now
 
+                               // Get next chars unicode number:
                        $cp = $this->utf8_ord($str,$bc,$pos);
                        $pos += $bc;
 
-                       if ($cp >= 0x41 && $cp <= 0x5A ||       // Basic Latin: capital letters
-                           $cp >= 0x30 && $cp <= 0x39 ||       // Numbers
-                               $cp >= 0x61 && $cp <= 0x7A)     {       //              small letters
-                               $found = true;
+                               // Determine the type:
+                       $cType_prev = $cType;
+                       list($cType) = $this->charType($cp);
+                       if ($cType)     {
                                continue;
                        }
 
-                       if ($cp >= 0xC0 && $cp <= 0xFF) {       // Latin-1 Supplement (0x80-0xFF)
-                               // 0x80-0x9F are unassigned
-                               // 0xA0-0xBF are non-letters
+                               // Setting letter to FALSE if the first char was not a letter!
+                       if (!$len)      $letter = FALSE;
+               }
 
-                               if ($cp != 0xD7 && $cp != 0xF7) {       // multiplication and division sign
-                                       $found = true;
-                                       continue;
-                               }
-                       } elseif ($cp >= 0x100 && $cp < 0x280)  {       // Latin Extended-A and -B
-                               $found = true;
-                               continue;
-                       } elseif ($cp >= 0x370 && $cp < 0x400)  {       // Greek and Coptic
-                               $found = true;
-                               continue;
-                       } elseif ($cp >= 0x400 && $cp < 0x530)  {       // Cyrillic and Cyrillic Supplement
-                               $found = true;
-                               continue;
-                       } elseif ($cp >= 0x590 && $cp < 0x600)  {       // Hebrew
-                               $found = true;
-                               continue;
-                       } elseif ($cp >= 0x600 && $cp < 0x700)  {       // Arabic
-                               $found = true;
-                               continue;
-                       }
-                               // I dont't think we need to support these:
-                               //  Latin Extended Additional
-                               //  Greek Extended
-                               //  Alphabetic Presentation Forms
-                               //  Arabic Presentation Forms-A
-                               //  Arabic Presentation Forms-B
-
-                       if (!$len)      $letter = false;
+               return FALSE;
+       }
+
+       /**
+        * Determine the type of character
+        *
+        * @param       integer         Unicode number to evaluate
+        * @return      array           Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean)
+        */
+       function charType($cp)  {
+
+                       // Numeric?
+               if (
+                               ($cp >= 0x30 && $cp <= 0x39)            // Arabic
+/*
+                               ($cp >= 0x660 && $cp <= 0x669) ||       // Arabic-Indic
+                               ($cp >= 0x6F0 && $cp <= 0x6F9) ||       // Arabic-Indic (Iran, Pakistan, and India)
+                               ($cp >= 0x3021 && $cp <= 0x3029) ||     // Hangzhou
+*/
+                       )       {
+                       return array('num');
                }
 
-               return false;
+                       // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
+               if (
+                               ($cp >= 0x41 && $cp <= 0x5A) ||         // Basic Latin: capital letters
+                               ($cp >= 0x61 && $cp <= 0x7A) ||         // Basic Latin: small letters
+                               ($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||                   // Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
+                               ($cp >= 0x100 && $cp < 0x280) ||        // Latin Extended-A and -B
+                               ($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) || // Greek and Coptic excluding non-letters
+                               (($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||             // Cyrillic and Cyrillic Supplement excluding historic miscellaneous
+                               (($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||     // Hebrew: only accents and letters
+                               (($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E &&  $cp <= 0x6D3)) ||  // Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
+                               ($cp >= 0x1E00 && $cp < 0x2000)         // Latin Extended Additional and Greek Extended
+                       )       {
+                       return array('alpha');
+               }
+
+                       // Looking for CJK (Chinese / Japanese / Korean)
+                       // Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
+                       // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
+               if (
+                               ($cp >= 0x3040 && $cp <= 0x30FF) ||             // HIRAGANA and KATAKANA letters
+                               ($cp >= 0x3130 && $cp <= 0x318F) ||             // Hangul Compatibility Jamo
+                               ($cp >= 0x3400 && $cp <= 0x4DBF) ||             // CJK Unified Ideographs Extension A
+                               ($cp >= 0x4E00 && $cp <= 0x9FAF) ||             // CJK Unified Ideographs
+                               ($cp >= 0xAC00 && $cp <= 0xD7AF) ||             // Hangul Syllables
+                               ($cp >= 0x20000 && $cp <= 0x2FA1F)              // CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
+                                                                                                               // also include CJK and Kangxi radicals or Bopomofo letter?
+                       )       {
+                       return array('cjk');
+               }
        }
 
        /**
-        * Get the first word in a given string (initial non-letters will be skipped)
+        * Converts a UTF-8 multibyte character to a UNICODE codepoint
         *
-        * @param       string          The charset
-        * @param       string          Input string (reference)
+        * @param       string          UTF-8 multibyte character string (reference)
+        * @param       integer         The length of the character (reference, return value)
         * @param       integer         Starting position in input string
-        * @return      array           0: start, 1: len or false if no word has been found
+        * @param       boolean         If set, then a hex. number is returned
+        * @return      integer         UNICODE codepoint
         */
-       function get_word($charset, &$str, $pos=0)      {
-               if ($charset == 'utf-8')        {
-                       $letters = $this->utf8_is_letter($str, $len, $pos, true);
-                       if ($letters)   return array($pos,$len);        // word found
+       function utf8_ord(&$str, &$len, $pos=0, $hex=FALSE)     {
+               $ord = ord($str{$pos});
+               $len = 1;
 
-                       $pos += $len;
-                       if ($str{$pos} == '')   return false;   // end of string
+               if ($ord > 0x80)        {
+                       for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
+                                       // calculate number of extra bytes
+                               $bc++;
+                       }
+                       $len += $bc;
 
-                       $this->utf8_is_letter($str, $len, $pos, true);
-                       return array($pos,$len);
+                       $ord = $ord & ((1 << (6-$bc)) - 1);     // mask utf-8 lead-in bytes
+                       for ($i=$pos+1; $bc; $bc--, $i++)       // "bring in" data bytes
+                               $ord = ($ord << 6) | (ord($str{$i}) & 0x3F);
                }
 
-               return false;
+               return $hex ? 'x'.dechex($ord) : $ord;
        }
 }
 
 
-if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])    {
-    include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
+if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])) {
+       include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
 }
 ?>
\ No newline at end of file