Commit 8652db6b authored by Kasper Skårhøj's avatar Kasper Skårhøj
Browse files

* The indexed-search lexer has been updated so it supports "printjoins" (characters like ' or - which are allowed inside of words) and there is also support for Chinese/Japanese/Korean (CJK) indexing/searching.


git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@562 709f56b5-9817-0410-a4d7-c38de5d9e867
parent c98a7b93
2005-02-15 Kasper Skårhøj,,, <kasper@typo3.com>
* The indexed-search lexer has been updated so it supports "printjoins" (characters like ' or - which are allowed inside of words) and there is also support for Chinese/Japanese/Korean (CJK) indexing/searching.
2005-02-14 Kasper Skårhøj,,, <kasper@typo3.com>
* Large number of updates to system extension indexed_search, in particular to the frontend search plugin which is now ready for implementation of templating engine by workgroup. Still missing: work on the lexer to support CJK content.
......
This diff is collapsed.
......@@ -162,8 +162,8 @@ http://www.microsoft.com/globaldev/reference/dbcs/932.htm
gb2312.tbl
Microsoft Windows Codepage : 936 (Simplified Chinese GBK)
gb2312 936 Chinese Simplified (GB2312)
gb_2312-80 936 Chinese Simplified (GB2312)
gb2312 936 Chinese Simplified (GB2312)
gb_2312-80 936 Chinese Simplified (GB2312)
http://www.microsoft.com/globaldev/reference/dbcs/936.htm
(Multibyte)
Note: this is a MS-specific superset of the real GB2312
......
......@@ -1438,7 +1438,7 @@ EXTENSION KEYS:
$this->content.=$this->doc->section('Extension files',$content,0,1);
break;
case 'updateModule':
$this->content.=$this->doc->section('Update:',$updateObj->main(),0,1);
$this->content.=$this->doc->section('Update:',is_object($updateObj) ? $updateObj->main() : 'No update object',0,1);
break;
default:
$this->extObjContent();
......
......@@ -35,20 +35,22 @@
*
*
*
* 73: class tx_indexed_search_extparse
* 90: function initParser($extension)
* 215: function initBackend($extension)
* 75: class tx_indexed_search_extparse
* 94: function initParser($extension)
* 227: function softInit($extension)
* 257: function searchTypeMediaTitle($extension)
* 330: function isMultiplePageExtension($extension)
*
* SECTION: Reading documents (for parsing)
* 261: function readFileContent($ext,$absFile,$cPKey)
* 441: function fileContentParts($ext,$absFile)
* 480: function splitPdfInfo($pdfInfoArray)
* 499: function removeEndJunk($string)
* 361: function readFileContent($ext,$absFile,$cPKey)
* 541: function fileContentParts($ext,$absFile)
* 580: function splitPdfInfo($pdfInfoArray)
* 599: function removeEndJunk($string)
*
* SECTION: Backend analyzer
* 526: function getIcon($extension)
* 626: function getIcon($extension)
*
* TOTAL FUNCTIONS: 7
* TOTAL FUNCTIONS: 9
* (This index is automatically created/updated by the extension "extdeveval")
*
*/
......
......@@ -84,37 +84,37 @@
*
* SECTION: SQL; External media
* 1387: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
* 1445: function submitFile_grlist($hash)
* 1459: function submitFile_section($hash)
* 1473: function removeOldIndexedFiles($phash)
* 1449: function submitFile_grlist($hash)
* 1463: function submitFile_section($hash)
* 1477: function removeOldIndexedFiles($phash)
*
* SECTION: SQL Helper functions
* 1509: function checkMtimeTstamp($mtime,$phash)
* 1545: function checkContentHash()
* 1562: function checkExternalDocContentHash($hashGr,$content_md5h)
* 1576: function is_grlist_set($phash_x)
* 1589: function update_grlist($phash,$phash_x)
* 1604: function updateTstamp($phash,$mtime=0)
* 1620: function updateParsetime($phash,$parsetime)
* 1633: function updateRootline()
* 1648: function getRootLineFields(&$fieldArr)
* 1667: function removeLoginpagesWithContentHash()
* 1513: function checkMtimeTstamp($mtime,$phash)
* 1549: function checkContentHash()
* 1566: function checkExternalDocContentHash($hashGr,$content_md5h)
* 1580: function is_grlist_set($phash_x)
* 1593: function update_grlist($phash,$phash_x)
* 1608: function updateTstamp($phash,$mtime=0)
* 1624: function updateParsetime($phash,$parsetime)
* 1637: function updateRootline()
* 1652: function getRootLineFields(&$fieldArr)
* 1671: function removeLoginpagesWithContentHash()
*
* SECTION: SQL; Submitting words
* 1702: function checkWordList($wl)
* 1739: function submitWords($wl,$phash)
* 1763: function freqMap($freq)
* 1706: function checkWordList($wl)
* 1743: function submitWords($wl,$phash)
* 1767: function freqMap($freq)
*
* SECTION: Hashing
* 1796: function setT3Hashes()
* 1822: function setExtHashes($file,$subinfo=array())
* 1846: function md5inthash($str)
* 1856: function makeCHash($paramArray)
* 1800: function setT3Hashes()
* 1826: function setExtHashes($file,$subinfo=array())
* 1850: function md5inthash($str)
* 1860: function makeCHash($paramArray)
*
* SECTION: Internal logging functions
* 1898: function log_push($msg,$key)
* 1907: function log_pull()
* 1918: function log_setTSlogMessage($msg, $errorNum=0)
* 1902: function log_push($msg,$key)
* 1911: function log_pull()
* 1922: function log_setTSlogMessage($msg, $errorNum=0)
*
* TOTAL FUNCTIONS: 55
* (This index is automatically created/updated by the extension "extdeveval")
......
......@@ -35,43 +35,25 @@
*
*
*
* 91: class tx_indexedsearch_lexer
* 105: function tx_indexedsearch_lexer()
* 117: function split2Words($wordString)
* 73: class tx_indexedsearch_lexer
* 104: function tx_indexedsearch_lexer()
* 115: function split2Words($wordString)
*
* SECTION: Helper functions
* 176: function utf8_ord(&$str, &$len, $pos=0, $hex=false)
* 201: function utf8_is_letter(&$str, &$len, $pos=0, $scan=false)
* 284: function get_word($charset, &$str, $pos=0)
* 178: function addWords(&$words, &$wordString, $start, $len)
* 239: function get_word(&$str, $pos=0)
* 264: function utf8_is_letter(&$str, &$len, $pos=0)
* 328: function charType($cp)
* 371: function utf8_ord(&$str, &$len, $pos=0, $hex=false)
*
* TOTAL FUNCTIONS: 5
* TOTAL FUNCTIONS: 7
* (This index is automatically created/updated by the extension "extdeveval")
*
*/
/*
DESCRIPTION OF (CJK) ALGORITHM
Continuous letters and numbers make up words. Spaces and symbols
separate letters and numbers into words. This is sufficient for
all western text.
CJK doesn't use spaces or separators to separate words, so the only
way to really find out what constitutes a word would be to have a
dictionary and advanced heuristics. Instead, we form pairs from
consecutive characters, in such a way that searches will find only
characters that appear more-or-less the right sequence. For example:
ABCDE => AB BC CD DE
This works okay since both the index and the search query is split
in the same manner, and since the set of characters is huge so the
extra matches are not significant.
*/
......@@ -90,12 +72,30 @@ DESCRIPTION OF (CJK) ALGORITHM
*/
class tx_indexedsearch_lexer {
var $debug = FALSE;
// Debugging options:
var $debug = FALSE; // If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display)
var $debugString = '';
var $csObj; // Charset class object , t3lib_cs
// Configuration of the lexer:
// Three keys are read by the lexer methods: 'printjoins' (in_array() lookup in utf8_is_letter()),
// 'casesensitive' (checked before lowercasing the input string) and 'removeChars' (used by addWords()).
var $lexerConf = array(
'printjoins' => array( // Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK) without ending the word
0x2e, // "."
0x2d, // "-"
0x5f, // "_"
0x3a, // ":"
0x2f, // "/"
0x2d, // "-" (NOTE(review): duplicate of the entry above; redundant for an in_array() lookup)
0x27, // "'"
),
'casesensitive' => FALSE, // Set, if case sensitive indexing is wanted. When FALSE the input is converted to lowercase before splitting.
'removeChars' => array( // List of unicode numbers of chars that are stripped from non-CJK words before they are returned (eg. "-")
0x2d // "-"
)
);
/**
* Constructor: Initializes the charset class, t3lib_cs
......@@ -106,7 +106,6 @@ class tx_indexedsearch_lexer {
$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
/**
* Splitting string into words.
* Used for indexing, can also be used to find words in query.
......@@ -120,7 +119,9 @@ class tx_indexedsearch_lexer {
$this->debugString = '';
// Then convert the string to lowercase:
$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
if (!$this->lexerConf['casesensitive']) {
$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
}
// Now, splitting words:
$len = 0;
......@@ -130,12 +131,14 @@ class tx_indexedsearch_lexer {
$this->debugString = '';
while(1) {
list($start,$len) = $this->get_word('utf-8', $wordString, $pos);
list($start,$len) = $this->get_word($wordString, $pos);
if ($len) {
$words[] = substr($wordString,$start,$len);
$this->addWords($words, $wordString,$start,$len);
if ($this->debug) {
$this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.htmlspecialchars(substr($wordString,$start,$len));
$this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.
htmlspecialchars(substr($wordString,$start,$len));
}
$pos = $start+$len;
......@@ -156,37 +159,99 @@ class tx_indexedsearch_lexer {
/************************************
/**********************************
*
* Helper functions
*
************************************/
********************************/
/**
* Converts a UTF-8 multibyte character to a UNICODE codepoint
* Add word to word- array
* This function should be used to make sure CJK sequences are split up in the right way
*
* @param string UTF-8 multibyte character string (reference)
* @param integer The length of the character (reference, return value)
* @param array Array of accumulated words
* @param string Complete Input string from where to extract word
* @param integer Start position of word in input string
* @param integer The Length of the word string from start position
* @return void
*/
function addWords(&$words, &$wordString, $start, $len) {

		// Cut the word out of the complete input string:
	$theWord = substr($wordString, $start, $len);

		// The type of the first character decides how the whole word is handled:
	$firstCharBytes = 0;
	$firstCodePoint = $this->utf8_ord($theWord, $firstCharBytes);
	list($cType) = $this->charType($firstCodePoint);

		// Ordinary ("single-byte" / western) words: strip the configured chars and store the word as one entry.
	if ($cType != 'cjk') {
		$stripChars = array();
		foreach ($this->lexerConf['removeChars'] as $unicodeNumber) {
			$stripChars[] = $this->csObj->UnumberToChar($unicodeNumber);
		}
			// str_replace() applies the search strings one after another, in list order.
		$words[] = str_replace($stripChars, '', $theWord);
		return;
	}

		/*
		CJK SEQUENCES

		In western text, runs of letters and digits are words, and spaces or
		symbols separate them. Chinese / Japanese / Korean text has no such
		separators, so finding the real word boundaries would take a dictionary
		and advanced heuristics. Instead the sequence is stored as overlapping
		pairs of consecutive characters:

			ABCDE => AB BC CD DE

		A search then only matches characters that occur in more or less the
		right order. This is good enough because the index and the search query
		are split in the same way, and because the character set is so large
		that the extra matches are insignificant.
		(Hint taken from ZOPEs chinese user group)
		[Kasper: As far as I can see this will only work well with or-searches!]
		*/
	$charCount = $this->csObj->utf8_strlen($theWord);

		// A single character is stored on its own; otherwise every character except the last one starts a pair.
	$pairCount = ($charCount == 1) ? 1 : $charCount - 1;
	for ($offset = 0; $offset < $pairCount; $offset++) {
		$words[] = $this->csObj->utf8_substr($theWord, $offset, 2);
	}
}
/**
* Get the first word in a given utf-8 string (initial non-letters will be skipped)
*
* @param string Input string (reference)
* @param integer Starting position in input string
* @param boolean If set, then a hex. number is returned
* @return integer UNICODE codepoint
* @return array 0: start, 1: len or false if no word has been found
*/
function utf8_ord(&$str, &$len, $pos=0, $hex=false) {
$ord = ord($str{$pos});
$len = 1;
function get_word(&$str, $pos=0) {
if ($ord > 0x80) {
for ($bc=-1, $mbs=$ord; $mbs & 0x80; $mbs = $mbs << 1) $bc++; // calculate number of extra bytes
$len += $bc;
$len=0;
$ord = $ord & ((1 << (6-$bc)) - 1); // mask utf-8 lead-in bytes
for ($i=$pos+1; $bc; $bc--, $i++) // "bring in" data bytes
$ord = ($ord << 6) | (ord($str{$i}) & 0x3F);
// If return is true, a word was found starting at this position, so returning position and length:
if ($this->utf8_is_letter($str, $len, $pos)) {
return array($pos,$len);
}
return $hex ? 'x'.dechex($ord) : $ord;
// If the return value was false it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
$pos += $len;
if ($str{$pos} == '') return false; // check end of string before looking for word of course.
$this->utf8_is_letter($str, $len, $pos);
return array($pos,$len);
}
/**
......@@ -195,79 +260,61 @@ class tx_indexedsearch_lexer {
* @param string Input string (reference)
* @param integer Byte-length of character sequence (reference, return value)
* @param integer Starting position in input string
* @param boolean If set will scan for a whole sequence of characters
* @return boolean letter (or word) found
*/
function utf8_is_letter(&$str, &$len, $pos=0, $scan=false) {
function utf8_is_letter(&$str, &$len, $pos=0) {
global $cs;
$len = 0;
$bc = 0;
$found = false; // found a letter
$cType = $cType_prev = false; // Letter type
$letter = true; // looking for a letter?
if ($str{$pos} == '') return false;
if ($str{$pos} == '') return false; // Return false on end-of-string at this stage
while(1) {
// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
if ($len) {
if ($scan) {
if ($letter && !$found) { // end of word reached
return true;
}
elseif (!$letter && $found) { // end of non-word reached
return false;
if ($letter) { // We are in a sequence of words
if (!$cType // The char was NOT a letter
|| ($cType_prev=='cjk' && t3lib_div::inList('num,alpha',$cType)) || ($cType=='cjk' && t3lib_div::inList('num,alpha',$cType_prev)) // ... or the previous and current char are from single-byte sets vs. asian CJK sets
) {
// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
if (!in_array($cp,$this->lexerConf['printjoins'])) {
// If a printjoin start length has been record, set that back now so the length is right (filtering out multiple end chars)
if ($printJoinLgd) {
$len = $printJoinLgd;
}
#debug($cp);
return true;
} else { // If a printJoin char is found, record the length if it has not been recorded already:
if (!$printJoinLgd) $printJoinLgd = $len;
}
} else { // When a true letter is found, reset printJoinLgd counter:
$printJoinLgd = 0;
}
}
else {
return $found; // report single letter status
elseif (!$letter && $cType) { // end of non-word reached
return false;
}
}
$len += $bc; // add byte-length of last found character
$found = false;
if ($str{$pos} == '') return $letter; // end of string
if ($str{$pos} == '') return $letter; // end of string; return status of string till now
// Get next chars unicode number:
$cp = $this->utf8_ord($str,$bc,$pos);
$pos += $bc;
if ($cp >= 0x41 && $cp <= 0x5A || // Basic Latin: capital letters
$cp >= 0x30 && $cp <= 0x39 || // Numbers
$cp >= 0x61 && $cp <= 0x7A) { // small letters
$found = true;
// Determine the type:
$cType_prev = $cType;
list($cType) = $this->charType($cp);
if ($cType) {
continue;
}
if ($cp >= 0xC0 && $cp <= 0xFF) { // Latin-1 Supplement (0x80-0xFF)
// 0x80-0x9F are unassigned
// 0xA0-0xBF are non-letters
if ($cp != 0xD7 && $cp != 0xF7) { // multiplication and division sign
$found = true;
continue;
}
} elseif ($cp >= 0x100 && $cp < 0x280) { // Latin Extended-A and -B
$found = true;
continue;
} elseif ($cp >= 0x370 && $cp < 0x400) { // Greek and Coptic
$found = true;
continue;
} elseif ($cp >= 0x400 && $cp < 0x530) { // Cyrillic and Cyrillic Supplement
$found = true;
continue;
} elseif ($cp >= 0x590 && $cp < 0x600) { // Hebrew
$found = true;
continue;
} elseif ($cp >= 0x600 && $cp < 0x700) { // Arabic
$found = true;
continue;
}
// I don't think we need to support these:
// Latin Extended Additional
// Greek Extended
// Alphabetic Presentation Forms
// Arabic Presentation Forms-A
// Arabic Presentation Forms-B
// Setting letter to false if the first char was not a letter!
if (!$len) $letter = false;
}
......@@ -275,26 +322,68 @@ class tx_indexedsearch_lexer {
}
/**
* Get the first word in a given string (initial non-letters will be skipped)
* Determine the type of character
*
* @param string The charset
* @param string Input string (reference)
* @param integer Unicode number to evaluate
* @return array Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean)
*/
function charType($cp) {
		// Classifies a Unicode code point for the lexer.
		//
		// Input:  $cp - integer Unicode number of the character to evaluate.
		// Output: array with the main type in index 0: 'num', 'alpha' or 'cjk'
		//         (Chinese / Japanese / Korean). For any other character (spaces,
		//         punctuation, symbols, unsupported scripts) NULL is returned, so
		//         "list($cType) = $this->charType($cp)" leaves $cType empty.

		// Numeric? (digits 0-9)
	if ($cp >= 0x30 && $cp <= 0x39) {
		return array('num');
	}

		// Alphabetic characters of the supported scripts:
	if (
			($cp >= 0x41 && $cp <= 0x5A)	||	// Basic Latin: capital letters
			($cp >= 0x61 && $cp <= 0x7A)	||	// Basic Latin: small letters
			($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7)	||	// Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
			($cp >= 0x100 && $cp < 0x280)	||	// Latin Extended-A and -B
			($cp >= 0x370 && $cp < 0x400)	||	// Greek and Coptic
			($cp >= 0x400 && $cp < 0x530)	||	// Cyrillic and Cyrillic Supplement
			($cp >= 0x590 && $cp < 0x600)	||	// Hebrew
			($cp >= 0x600 && $cp < 0x700)		// Arabic
		) {
		return array('alpha');
	}

		// CJK (Chinese / Japanese / Korean).
		// The ideograph and Hangul syllable ranges cover the complete Unicode blocks. The ranges
		// originally deduced from the translation tables in t3lib/csconvtbl/ started two code
		// points too late and thereby excluded very common characters such as U+4E00 ("one")
		// and U+AC00 (first Hangul syllable), which broke CJK sequences apart at those characters.
	if (
			($cp >= 0x4E00 && $cp <= 0x9FFF)	||	// CJK Unified Ideographs (whole block)
			($cp >= 0xAC00 && $cp <= 0xD7A3)	||	// Hangul Syllables (whole block)
			($cp >= 0x3131 && $cp <= 0x318E)	||	// Hangul (compatibility) letters
			($cp >= 0x3041 && $cp <= 0x3093)	||	// Hiragana letters
			($cp >= 0x30A1 && $cp <= 0x30F6)		// Katakana letters
		) {
		return array('cjk');
	}

		// Not a word character:
	return NULL;
}
/**
* Converts a UTF-8 multibyte character to a UNICODE codepoint
*
* @param string UTF-8 multibyte character string (reference)
* @param integer The length of the character (reference, return value)
* @param integer Starting position in input string
* @return array 0: start, 1: len or false if no word has been found
* @param boolean If set, then a hex. number is returned
* @return integer UNICODE codepoint
*/
function get_word($charset, &$str, $pos=0) {
if ($charset == 'utf-8') {
$letters = $this->utf8_is_letter($str, $len, $pos, true);
if ($letters) return array($pos,$len); // word found
function utf8_ord(&$str, &$len, $pos=0, $hex=false) {
$ord = ord($str{$pos});
$len = 1;
$pos += $len;
if ($str{$pos} == '') return false; // end of string
if ($ord > 0x80) {
for ($bc=-1, $mbs=$ord; $mbs & 0x80; $mbs = $mbs << 1) $bc++; // calculate number of extra bytes
$len += $bc;
$this->utf8_is_letter($str, $len, $pos, true);
return array($pos,$len);
$ord = $ord & ((1 << (6-$bc)) - 1); // mask utf-8 lead-in bytes
for ($i=$pos+1; $bc; $bc--, $i++) // "bring in" data bytes
$ord = ($ord << 6) | (ord($str{$i}) & 0x3F);
}
return false;
return $hex ? 'x'.dechex($ord) : $ord;
}
}
......
TIRSDAG:
- Improve lexer:
- See BASIC_LEXER from Oracle
- CJK hack from Zope.
- Test
- Implement in search query analysis.
- TESTING with russian, danish, chinese, japanese etc...
- CVS
- split search words by new lexer function! (getSearchWords)
ONSDAG:
- Crawler
TORSDAG / FREDAG:
- Test / Koordinering.
......@@ -21,15 +10,16 @@ Search test:
- external media respect privacy of pages?
- external media on multiple pages with DIFFERENT languages?
- checkResume: Should it also check for gr_list "0,-1"?
- case-sensitivity?
- Warning: phash-row "114682730" didn't have a representation in the index_section table! on references page!
Example på alternativ søgning!
XHTML i frontend?
Backend modules:
- Proper skinning? / getLL? / XHTML
</diverse>
Implement stop-word setting in: "Top-20 words by count:" and a list separate from that (in main module?)
</diverse>
Test kaniner (indexed search / caching?):
- 3DS
......
......@@ -38,10 +38,9 @@
*
* 59: class tx_indexedsearch_pihook
* 72: function initialize_postProc()
* 82: function getResultRows($sWArr)
* 94: function prepareResultRowTemplateData_postProc($tmplContent, $row, $headerOnly)
* 95: function prepareResultRowTemplateData_postProc($tmplContent, $row, $headerOnly)
*