Updating alpha chars info in charType().
author Martin Kutschker <martin.t.kutschker@blackbox.net>
Tue, 8 Mar 2005 12:16:03 +0000 (12:16 +0000)
committer Martin Kutschker <martin.t.kutschker@blackbox.net>
Tue, 8 Mar 2005 12:16:03 +0000 (12:16 +0000)
git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@573 709f56b5-9817-0410-a4d7-c38de5d9e867

typo3/sysext/indexed_search/class.lexer.php

index 4d49182..b58b490 100755 (executable)
@@ -87,8 +87,9 @@ class tx_indexedsearch_lexer {
                        0x5f,   // "_"
                        0x3a,   // ":"
                        0x2f,   // "/"
-                       0x2d,   // "-"
+                       0x2d,   // "-" DUPE
                        0x27,   // "'"
+       // 0x615 ARABIC SMALL HIGH TAH
                ),
                'casesensitive' => FALSE,       // Set, if case sensitive indexing is wanted.
                'removeChars' => array(         // List of unicode numbers of chars that will be removed before words are returned (eg. "-")
@@ -329,20 +330,28 @@ class tx_indexedsearch_lexer {
        function charType($cp)  {
 
                        // Numeric?
-               if ($cp >= 0x30 && $cp <= 0x39) {
+               if (
+                               ($cp >= 0x30 && $cp <= 0x39)            // Arabic
+/*
+                               ($cp >= 0x660 && $cp <= 0x669) ||       // Arabic-Indic
+                               ($cp >= 0x6F0 && $cp <= 0x6F9) ||       // Arabic-Indic (Iran, Pakistan, and India)
+                               ($cp >= 0x3021 && $cp <= 0x3029) ||     // Hangzhou
+*/
+                       )       {
                        return array('num');
                }
 
-                       // LOOKING for Alpha chars:
+                       // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
                if (
                                ($cp >= 0x41 && $cp <= 0x5A) ||         // Basic Latin: capital letters
-                               ($cp >= 0x61 && $cp <= 0x7A) ||         // small letters
-                               ($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7)      || // Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
+                               ($cp >= 0x61 && $cp <= 0x7A) ||         // Basic Latin: small letters
+                               ($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||                   // Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
                                ($cp >= 0x100 && $cp < 0x280) ||        // Latin Extended-A and -B
-                               ($cp >= 0x370 && $cp < 0x400) ||        // Greek and Coptic
-                               ($cp >= 0x400 && $cp < 0x530) ||        // Cyrillic and Cyrillic Supplement
-                               ($cp >= 0x590 && $cp < 0x600) ||        // Hebrew
-                               ($cp >= 0x600 && $cp < 0x700)           // Arabic
+                               ($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||      // Greek and Coptic excluding non-letters
+                               (($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||             // Cyrillic and Cyrillic Supplement excluding historic miscellaneous
+                               (($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||     // Hebrew: only accents and letters
+                               (($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E &&  $cp <= 0x6D3)) ||  // Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
+                               ($cp >= 0x1E00 && $cp < 0x2000)         // Latin Extended Additional and Greek Extended
                        )       {
                        return array('alpha');
                }