[TASK] Remove function index
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Lexer for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
32 */
33
34
35
36
37
38
39
40
41
42
43
44
/**
 * Lexer class for indexed_search
 * A lexer splits a UTF-8 text into words. Sequences of alphanumeric characters
 * are returned as single words; CJK sequences are returned as overlapping
 * pairs of characters (see addWords()).
 *
 * All positions and lengths handled by this class are BYTE offsets into the
 * UTF-8 string, not character offsets.
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexedsearch_lexer {

	/**
	 * If set, $debugString is filled with HTML output highlighting
	 * search / non-search words (for backend display)
	 *
	 * @var boolean
	 */
	public $debug = FALSE;

	/**
	 * HTML debug output of the last split2Words() call (only filled if $debug is set)
	 *
	 * @var string
	 */
	public $debugString = '';

	/**
	 * Charset class object
	 *
	 * @var t3lib_cs
	 */
	public $csObj;

	/**
	 * Configuration of the lexer
	 *
	 * @var array
	 */
	public $lexerConf = array(
			// Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
		'printjoins' => array(
			0x2e, // "."
			0x2d, // "-"
			0x5f, // "_"
			0x3a, // ":"
			0x2f, // "/"
			0x27, // "'"
			// 0x615, // ARABIC SMALL HIGH TAH
		),
			// Set, if case sensitive indexing is wanted.
		'casesensitive' => FALSE,
			// Unicode numbers of chars that will be removed before words are returned (eg. "-")
		'removeChars' => array(
			0x2d // "-"
		)
	);

	/**
	 * Constructor: Initializes the charset class, t3lib_cs
	 *
	 * @return void
	 */
	public function __construct() {
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * @param string String with UTF-8 content to process.
	 * @return array Array of words in utf-8
	 */
	public function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Convert the string to lowercase unless case sensitive indexing is configured:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// Now, splitting words:
		$words = array();
		$pos = 0;

		while (TRUE) {
			$found = $this->get_word($wordString, $pos);

				// get_word() returns FALSE at the end of the input; a zero length also means "no more words".
			if (!is_array($found) || empty($found[1])) {
				break;
			}
			list($start, $len) = $found;

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
					// Skipped (non-word) bytes are shown in red, followed by the word itself.
					// substr() is intentional here: $pos/$start/$len are byte offsets.
				$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, $start - $pos)) . '</span>' .
					htmlspecialchars(substr($wordString, $start, $len));
			}

			$pos = $start + $len;
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/


	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * @param array Array of accumulated words (reference, words are appended)
	 * @param string Complete Input string from where to extract word
	 * @param integer Start position (byte offset) of word in input string
	 * @param integer The Length (in bytes) of the word string from start position
	 * @return void
	 */
	public function addWords(&$words, &$wordString, $start, $len) {

			// Get word out of string:
		$theWord = substr($wordString, $start, $len);

			// Get the unicode number of the first char and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		$typeInfo = $this->charType($cp);
		$cType = isset($typeInfo[0]) ? $typeInfo[0] : FALSE;

			// If string is a CJK sequence we follow this algorithm:
			/*
				DESCRIPTION OF (CJK) ALGORITHM

				Continuous letters and numbers make up words. Spaces and symbols
				separate letters and numbers into words. This is sufficient for
				all western text.

				CJK doesn't use spaces or separators to separate words, so the only
				way to really find out what constitutes a word would be to have a
				dictionary and advanced heuristics. Instead, we form pairs from
				consecutive characters, in such a way that searches will find only
				characters that appear more-or-less the right sequence. For example:

					ABCDE => AB BC CD DE

				This works okay since both the index and the search query is split
				in the same manner, and since the set of characters is huge so the
				extra matches are not significant.

				(Hint taken from ZOPEs chinese user group)

				[Kasper: As far as I can see this will only work well with or-searches!]
			*/
		if ($cType === 'cjk') {
				// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// Traverse string length and add words as pairs of two chars.
				// A single character is added on its own; otherwise the last
				// character is already covered by the preceding pair.
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else { // Normal "single-byte" chars:
				// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
				// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param string Input string (reference)
	 * @param integer Starting position (byte offset) in input string
	 * @return array 0: start, 1: len or FALSE if no word has been found
	 */
	public function get_word(&$str, $pos = 0) {

		$len = 0;

			// If return is TRUE, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}

			// If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we skip it and start another search for the word:
		$pos += $len;
		if (!isset($str[$pos])) {
				// End of string reached before any word was found.
			return FALSE;
		}

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Starting at $pos the method consumes either a run of word characters
	 * (returns TRUE) or a run of non-word characters (returns FALSE) and
	 * reports the byte length of that run in $len. Print-join characters
	 * (lexerConf['printjoins']) are accepted inside a word but are cut off
	 * again when they turn out to be trailing the word.
	 *
	 * @param string Input string (reference)
	 * @param integer Byte-length of character sequence (reference, return value)
	 * @param integer Starting position (byte offset) in input string
	 * @return boolean letter (or word) found
	 */
	public function utf8_is_letter(&$str, &$len, $pos = 0) {
		$len = 0;
		$bc = 0; // Byte length of the character read last
		$cp = 0; // Unicode number of the character read last
		$cType = $cType_prev = FALSE; // Letter type
		$letter = TRUE; // looking for a letter?
		$printJoinLgd = 0; // Word length before the first of a run of print-join chars (0 = none pending)
		$singleByteTypes = array('num', 'alpha');

		if (!isset($str[$pos])) {
				// Return FALSE on end-of-string at this stage
			return FALSE;
		}

		while (TRUE) {

				// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
			if ($len) {
				if ($letter) { // We are in a sequence of words
					$endOfSequence = !$cType // The char was NOT a letter
						|| ($cType_prev === 'cjk' && in_array($cType, $singleByteTypes, TRUE)) // ... or the previous and current char are from single-byte sets vs. asian CJK sets
						|| ($cType === 'cjk' && in_array($cType_prev, $singleByteTypes, TRUE));

					if ($endOfSequence) {
							// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						} elseif (!$printJoinLgd) {
								// A printJoin char is found: record the length if it has not been recorded already
							$printJoinLgd = $len;
						}
					} else { // When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) { // end of non-word reached
					return FALSE;
				}
			}
			$len += $bc; // add byte-length of last found character

			if (!isset($str[$pos])) {
					// End of string. Trailing print-join chars are not part of the word,
					// exactly as when the word is terminated by any other non-letter.
				if ($letter && $printJoinLgd) {
					$len = $printJoinLgd;
				}
				return $letter; // return status of string till now
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			$typeInfo = $this->charType($cp);
			$cType = isset($typeInfo[0]) ? $typeInfo[0] : FALSE;
			if ($cType) {
				continue;
			}

				// Setting letter to FALSE if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Determine the type of character
	 *
	 * @param integer Unicode number to evaluate
	 * @return array Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean). NULL is returned if the char is none of these.
	 */
	public function charType($cp) {

			// Numeric?
		if (
				($cp >= 0x30 && $cp <= 0x39)		// Arabic
				/*
				($cp >= 0x660 && $cp <= 0x669) ||	// Arabic-Indic
				($cp >= 0x6F0 && $cp <= 0x6F9) ||	// Arabic-Indic (Iran, Pakistan, and India)
				($cp >= 0x3021 && $cp <= 0x3029) ||	// Hangzhou
				*/
			) {
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
				($cp >= 0x41 && $cp <= 0x5A) ||		// Basic Latin: capital letters
				($cp >= 0x61 && $cp <= 0x7A) ||		// Basic Latin: small letters
				($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||	// Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
				($cp >= 0x100 && $cp < 0x280) ||	// Latin Extended-A and -B
				($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||	// Greek and Coptic excluding non-letters
				(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||	// Cyrillic and Cyrillic Supplement excluding historic miscellaneous
				(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||	// Hebrew: only accents and letters
				(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||	// Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
				($cp >= 0x1E00 && $cp < 0x2000)		// Latin Extended Additional and Greek Extended
			) {
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
				($cp >= 0x3040 && $cp <= 0x30FF) ||	// HIRAGANA and KATAKANA letters
				($cp >= 0x3130 && $cp <= 0x318F) ||	// Hangul Compatibility Jamo
				($cp >= 0x3400 && $cp <= 0x4DBF) ||	// CJK Unified Ideographs Extension A
				($cp >= 0x4E00 && $cp <= 0x9FAF) ||	// CJK Unified Ideographs
				($cp >= 0xAC00 && $cp <= 0xD7AF) ||	// Hangul Syllables
				($cp >= 0x20000 && $cp <= 0x2FA1F)	// CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
													// also include CJK and Kangxi radicals or Bopomofo letter?
			) {
			return array('cjk');
		}

			// Anything else is not a word character.
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * @param string UTF-8 multibyte character string (reference)
	 * @param integer The length of the character in bytes (reference, return value)
	 * @param integer Starting position (byte offset) in input string
	 * @param boolean If set, then a hex. number is returned (as string prefixed with "x")
	 * @return integer UNICODE codepoint
	 */
	public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
			// A position outside the string is treated like a NUL byte.
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord > 0x80) {
				// Calculate number of extra bytes from the number of leading 1-bits of the first byte
			for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
				$bc++;
			}

			if ($bc > 6) {
					// Byte 0xFF never occurs in UTF-8. The mask calculation below would
					// need a negative shift for it, so it is consumed as one non-letter byte.
				$ord = 0;
			} else {
				$len += $bc;

				$ord = $ord & ((1 << (6 - $bc)) - 1); // mask utf-8 lead-in bytes
				for ($i = $pos + 1; $bc; $bc--, $i++) { // "bring in" data bytes
						// Data bytes missing from a truncated sequence count as zero bits.
					$dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
					$ord = ($ord << 6) | ($dataByte & 0x3F);
				}
			}
		}

		return $hex ? 'x' . dechex($ord) : $ord;
	}
}
387
388
// XCLASS hook: if an extending class file has been registered for this script
// in TYPO3_CONF_VARS for the current TYPO3_MODE, load it (once).
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
	}
}
392 ?>