Added feature #4592: Keep indexed_search tables consistent when deleting/hiding pages...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Lexer for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 73: class tx_indexedsearch_lexer
39 * 105: function tx_indexedsearch_lexer()
40 * 116: function split2Words($wordString)
41 *
42 * SECTION: Helper functions
43 * 178: function addWords(&$words, &$wordString, $start, $len)
44 * 239: function get_word(&$str, $pos=0)
45 * 264: function utf8_is_letter(&$str, &$len, $pos=0)
46 * 329: function charType($cp)
47 * 383: function utf8_ord(&$str, &$len, $pos=0, $hex=false)
48 *
49 * TOTAL FUNCTIONS: 7
50 * (This index is automatically created/updated by the extension "extdeveval")
51 *
52 */
53
54
55
56
57
58
59
60
61
62
63
64
/**
 * Lexer class for indexed_search.
 * A lexer splits a UTF-8 text into the words that are indexed / searched for.
 *
 * Words are runs of characters classified by charType() as 'num', 'alpha' or 'cjk'.
 * Runs starting with a CJK character are emitted as overlapping two-character pairs
 * (see addWords()); all other runs are emitted as one word each.
 *
 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexedsearch_lexer {

		// Debugging options:
	var $debug = FALSE;		// If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display)
	var $debugString = '';	// HTML produced by the last split2Words() call while $debug was set

	var $csObj;				// Charset class object, t3lib_cs (created in the constructor)

		// Configuration of the lexer:
	var $lexerConf = array(
		'printjoins' => array(	// Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
			0x2e,	// "."
			0x2d,	// "-"
			0x5f,	// "_"
			0x3a,	// ":"
			0x2f,	// "/"
			0x27,	// "'"
			// 0x615,	// ARABIC SMALL HIGH TAH (not enabled)
		),
		'casesensitive' => FALSE,	// Set, if case sensitive indexing is wanted.
		'removeChars' => array(		// List of unicode numbers of chars that will be removed before words are returned (eg. "-")
			0x2d	// "-"
		)
	);


	/**
	 * Constructor: Initializes the charset class, t3lib_cs
	 *
	 * Declared as __construct() so the object is also initialized on PHP versions
	 * that no longer treat a method named like the class as constructor.
	 *
	 * @return	void
	 */
	function __construct() {
		$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Old-style (PHP 4) constructor, kept for backward compatibility.
	 * Simply forwards to __construct().
	 *
	 * @return	void
	 */
	function tx_indexedsearch_lexer() {
		$this->__construct();
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * Unless $this->lexerConf['casesensitive'] is set, the input is lowercased first.
	 * If $this->debug is set, $this->debugString is filled with the skipped (non-word)
	 * parts wrapped in a red <span> followed by the word parts, all HTML-escaped.
	 *
	 * @param	string		String with UTF-8 content to process.
	 * @return	array		Array of words in utf-8
	 */
	function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Then convert the string to lowercase:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// Now, splitting words:
		$pos = 0;
		$words = array();

		while (TRUE) {
				// get_word() returns FALSE when only non-word chars (or nothing) are left:
			$found = $this->get_word($wordString, $pos);
			if (!is_array($found) || !$found[1]) {
				break;
			}
			$start = $found[0];
			$len = $found[1];

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
					// Bytes between $pos and $start were skipped as non-word chars:
				$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, $start - $pos)) . '</span>' .
					htmlspecialchars(substr($wordString, $start, $len));
			}

				// Continue searching right after the word just found:
			$pos = $start + $len;
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/


	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * DESCRIPTION OF (CJK) ALGORITHM
	 *
	 * Continuous letters and numbers make up words. Spaces and symbols
	 * separate letters and numbers into words. This is sufficient for
	 * all western text.
	 *
	 * CJK doesn't use spaces or separators to separate words, so the only
	 * way to really find out what constitutes a word would be to have a
	 * dictionary and advanced heuristics. Instead, we form pairs from
	 * consecutive characters, in such a way that searches will find only
	 * characters that appear more-or-less the right sequence. For example:
	 *
	 *   ABCDE => AB BC CD DE
	 *
	 * This works okay since both the index and the search query is split
	 * in the same manner, and since the set of characters is huge so the
	 * extra matches are not significant.
	 *
	 * (Hint taken from ZOPEs chinese user group)
	 *
	 * [Kasper: As far as I can see this will only work well with or-searches!]
	 *
	 * @param	array		Array of accumulated words (reference; new words are appended)
	 * @param	string		Complete Input string from where to extract word (reference, not modified)
	 * @param	integer		Start position (byte offset) of word in input string
	 * @param	integer		The length (in bytes) of the word string from start position
	 * @return	void
	 */
	function addWords(&$words, &$wordString, $start, $len) {

			// Get word out of string:
		$theWord = substr($wordString, $start, $len);

			// The type of the FIRST character decides how the whole word is handled:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		list($cType) = $this->charType($cp);

		if ($cType === 'cjk') {
				// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// Add overlapping pairs of two chars; a single character is added on its own:
			$pairCount = ($strlen == 1) ? 1 : $strlen - 1;
			for ($a = 0; $a < $pairCount; $a++) {
				$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
			}
		} else {	// Normal "single-byte" chars:
				// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
				// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param	string		Input string (reference)
	 * @param	integer		Starting position (byte offset) in input string
	 * @return	array		0: start, 1: len or false if no word has been found
	 */
	function get_word(&$str, $pos=0) {

		$len = 0;

			// If return is true, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}

			// If the return value was false it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
		$pos += $len;
		if (!isset($str[$pos])) {	// check end of string before looking for word of course.
			return FALSE;
		}

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Scans from $pos and measures the leading run of characters that are either all
	 * word characters (return TRUE) or all non-word characters (return FALSE).
	 * Print-join characters (lexerConf['printjoins']) are accepted inside a word; when
	 * a word is ended by another character, print-join chars directly before that
	 * character are not counted.
	 * NOTE: when the string ends while still inside a word, trailing print-join
	 * characters are included in the length.
	 *
	 * @param	string		Input string (reference)
	 * @param	integer		Byte-length of character sequence (reference, return value)
	 * @param	integer		Starting position (byte offset) in input string
	 * @return	boolean		letter (or word) found
	 */
	function utf8_is_letter(&$str, &$len, $pos=0) {

		$len = 0;
		$bc = 0;				// Byte count of the character read last
		$cp = 0;				// Unicode number of the character read last
		$printJoinLgd = 0;		// Length of the word before a trailing run of print-join chars (0 = none pending)
		$cType = $cType_prev = FALSE;	// Letter type
		$letter = TRUE;			// looking for a letter?

		if (!isset($str[$pos])) {	// Return false on end-of-string at this stage
			return FALSE;
		}

		while (TRUE) {

				// $len lags one character behind: it does not yet include the char that $cp/$cType describe.
				// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
			if ($len) {
				if ($letter) {	// We are in a sequence of words
					if (!$cType	// The char was NOT a letter
						|| ($cType_prev == 'cjk' && t3lib_div::inList('num,alpha', $cType))
						|| ($cType == 'cjk' && t3lib_div::inList('num,alpha', $cType_prev))	// ... or the previous and current char are from single-byte sets vs. asian CJK sets
					) {
							// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// If a printjoin start length has been record, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						} elseif (!$printJoinLgd) {	// If a printJoin char is found, record the length if it has not been recorded already:
							$printJoinLgd = $len;
						}
					} else {	// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {	// end of non-word reached
					return FALSE;
				}
			}
			$len += $bc;	// add byte-length of last found character

			if (!isset($str[$pos])) {	// end of string; return status of string till now
				return $letter;
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			list($cType) = $this->charType($cp);
			if ($cType) {
				continue;
			}

				// Setting letter to false if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Determine the type of character
	 *
	 * @param	integer		Unicode number to evaluate
	 * @return	array		Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean). NULL is returned if the char is none of these.
	 */
	function charType($cp) {

			// Numeric? Only ASCII digits are recognized.
			// Not enabled: Arabic-Indic (0x660-0x669), Arabic-Indic for Iran, Pakistan and India (0x6F0-0x6F9), Hangzhou (0x3021-0x3029)
		if ($cp >= 0x30 && $cp <= 0x39) {
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
				($cp >= 0x41 && $cp <= 0x5A) ||		// Basic Latin: capital letters
				($cp >= 0x61 && $cp <= 0x7A) ||		// Basic Latin: small letters
				($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||	// Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
				($cp >= 0x100 && $cp < 0x280) ||	// Latin Extended-A and -B
				($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||	// Greek and Coptic excluding non-letters
				(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||	// Cyrillic and Cyrillic Supplement excluding historic miscellaneous
				(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||	// Hebrew: only accents and letters
				(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||	// Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
				($cp >= 0x1E00 && $cp < 0x2000)		// Latin Extended Additional and Greek Extended
			) {
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deduced from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
				($cp >= 0x3040 && $cp <= 0x30FF) ||		// HIRAGANA and KATAKANA letters
				($cp >= 0x3130 && $cp <= 0x318F) ||		// Hangul Compatibility Jamo
				($cp >= 0x3400 && $cp <= 0x4DBF) ||		// CJK Unified Ideographs Extension A
				($cp >= 0x4E00 && $cp <= 0x9FAF) ||		// CJK Unified Ideographs
				($cp >= 0xAC00 && $cp <= 0xD7AF) ||		// Hangul Syllables
				($cp >= 0x20000 && $cp <= 0x2FA1F)		// CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
				// also include CJK and Kangxi radicals or Bopomofo letter?
			) {
			return array('cjk');
		}

			// Neither number, letter nor CJK:
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * The bytes are not validated as well-formed UTF-8. Bytes missing at the end of
	 * the string count as zero, and the lead bytes 0xFE / 0xFF (which cannot start a
	 * sequence) are reported as codepoint 0 with length 1.
	 *
	 * @param	string		UTF-8 multibyte character string (reference)
	 * @param	integer		The length of the character in bytes (reference, return value)
	 * @param	integer		Starting position (byte offset) in input string
	 * @param	boolean		If set, then a hex. number is returned (as string prefixed with "x")
	 * @return	integer		UNICODE codepoint
	 */
	function utf8_ord(&$str, &$len, $pos=0, $hex=false) {
			// A position outside the string yields 0, just like ord('') does:
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord > 0x80) {
			for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {	// calculate number of extra bytes
				$bc++;
			}

			if ($bc > 5) {
					// 0xFE / 0xFF: the mask below would need a shift by less than one bit
					// (a negative shift count is an error in newer PHP versions), so the
					// byte is consumed on its own as codepoint 0.
				$ord = 0;
			} else {
				$len += $bc;

				$ord = $ord & ((1 << (6 - $bc)) - 1);	// mask utf-8 lead-in bytes
				for ($i = $pos + 1; $bc; $bc--, $i++) {	// "bring in" data bytes
					$ord = ($ord << 6) | (isset($str[$i]) ? (ord($str[$i]) & 0x3F) : 0);
				}
			}
		}

		return $hex ? 'x' . dechex($ord) : $ord;
	}
}
399
400
401 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']) {
402 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
403 }
404 ?>