[API][!!!] Fix API compatibility break
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Lexer for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
32 */
33
34
35
36
37
38
39
40
41
42
43
44
/**
 * Lexer class for indexed_search
 * A lexer splits the text into words
 *
 * All string positions and lengths handled by this class are BYTE offsets into
 * a UTF-8 encoded string; only the CJK pair splitting in addWords() counts in
 * characters (through the charset object).
 *
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexedsearch_lexer {

		// Debugging options:
	/**
	 * If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display)
	 *
	 * @var boolean
	 */
	public $debug = FALSE;

	/**
	 * HTML produced by the last split2Words() call when $debug is set
	 *
	 * @var string
	 */
	public $debugString = '';

	/**
	 * Charset class object
	 *
	 * @var t3lib_cs
	 */
	public $csObj;

	/**
	 * Configuration of the lexer
	 *
	 * @var array
	 */
	public $lexerConf = array(
		'printjoins' => array(	// This is the Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
			0x2e,	// "."
			0x2d,	// "-"
			0x5f,	// "_"
			0x3a,	// ":"
			0x2f,	// "/"
			0x27,	// "'"
			// 0x615,	// ARABIC SMALL HIGH TAH
		),
		'casesensitive' => FALSE,	// Set, if case sensitive indexing is wanted.
		'removeChars' => array(		// List of unicode numbers of chars that will be removed before words are returned (eg. "-")
			0x2d	// "-"
		)
	);

	/**
	 * Constructor: Initializes the charset class, t3lib_cs
	 *
	 * @return	void
	 */
	public function __construct() {
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Compatibility constructor.
	 *
	 * @deprecated since TYPO3 4.6 and will be removed in TYPO3 4.8. Use __construct() instead.
	 */
	public function tx_indexedsearch_lexer() {
		t3lib_div::logDeprecatedFunction();
			// Note: we cannot call $this->__construct() here because it would call the derived class constructor and cause recursion
			// This code uses official PHP behavior (http://www.php.net/manual/en/language.oop5.basic.php) when $this in the
			// statically called non-static method inherits $this from the caller's scope.
		tx_indexedsearch_lexer::__construct();
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * Unless 'casesensitive' is configured, the input is lowercased first.
	 * When $this->debug is set, $this->debugString receives the processed text
	 * as HTML with the skipped (non-word) stretches wrapped in red <span> tags.
	 *
	 * @param	string		String with UTF-8 content to process.
	 * @return	array		Array of words in utf-8
	 */
	public function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Then convert the string to lowercase:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// Now, splitting words:
		$pos = 0;
		$words = array();

		while (TRUE) {
			$found = $this->get_word($wordString, $pos);

				// get_word() returns FALSE (not an array) when the end of the string is reached:
			if (!is_array($found) || empty($found[1])) {
				break;
			}
			list($start, $len) = $found;

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
					// Bytes between the previous word and this one are the skipped non-word part:
				$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, $start - $pos)) . '</span>' .
					htmlspecialchars(substr($wordString, $start, $len));
			}

			$pos = $start + $len;
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/

	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * The type of the FIRST character of the word decides which branch is taken:
	 * a CJK word is added as overlapping two-character pairs, any other word is
	 * added once after the configured 'removeChars' have been stripped from it.
	 *
	 * @param	array		Array of accumulated words (reference, appended to)
	 * @param	string		Complete Input string from where to extract word (reference, not modified)
	 * @param	integer		Start position of word in input string (byte offset)
	 * @param	integer		The Length of the word string from start position (in bytes)
	 * @return	void
	 */
	public function addWords(&$words, &$wordString, $start, $len) {

			// Get word out of string:
		$theWord = substr($wordString, $start, $len);

			// Get first chars unicode number and find type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		list($cType) = $this->charType($cp);

			// If string is a CJK sequence we follow this algorithm:
			/*
				DESCRIPTION OF (CJK) ALGORITHM

				Continuous letters and numbers make up words.  Spaces and symbols
				separate letters and numbers into words.  This is sufficient for
				all western text.

				CJK doesn't use spaces or separators to separate words, so the only
				way to really find out what constitutes a word would be to have a
				dictionary and advanced heuristics.  Instead, we form pairs from
				consecutive characters, in such a way that searches will find only
				characters that appear more-or-less the right sequence.  For example:

					ABCDE => AB BC CD DE

				This works okay since both the index and the search query is split
				in the same manner, and since the set of characters is huge so the
				extra matches are not significant.

				(Hint taken from ZOPEs chinese user group)

				[Kasper: As far as I can see this will only work well with or-searches!]
			*/
		if ($cType === 'cjk') {
				// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// Traverse string length and add words as pairs of two chars.
				// The last character only forms a "pair" of its own when it is the sole character.
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else {	// Normal "single-byte" chars:
				// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
				// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param	string		Input string (reference, not modified)
	 * @param	integer		Starting position in input string (byte offset)
	 * @return	array		0: start, 1: len or FALSE if no word has been found
	 */
	public function get_word(&$str, $pos=0) {

		$len = 0;

			// If return is TRUE, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}

			// If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
		$pos += $len;

			// Check end of string before looking for word of course.
			// isset() is used so that no out-of-range string offset is ever read.
		if (!isset($str[$pos])) {
			return FALSE;
		}

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Starting at $pos the string is scanned for as long as the characters are
	 * of the same kind as the first one. $len receives the byte length of that
	 * run. A run of letters may contain 'printjoins' characters; print-join
	 * characters directly in front of the terminating character are not counted.
	 * A switch between CJK and num/alpha characters also terminates a word.
	 *
	 * @param	string		Input string (reference, not modified)
	 * @param	integer		Byte-length of character sequence (reference, return value)
	 * @param	integer		Starting position in input string (byte offset)
	 * @return	boolean		letter (or word) found
	 */
	public function utf8_is_letter(&$str, &$len, $pos=0) {
		$len = 0;
		$bc = 0;	// Byte length of the character read most recently
		$cp = 0;	// Unicode number of the character read most recently
		$printJoinLgd = 0;	// Value of $len where the current run of print-join chars began (0 = none pending)
		$cType = $cType_prev = FALSE;	// Letter type
		$letter = TRUE;	// looking for a letter?

			// Return FALSE on end-of-string at this stage
		if (!isset($str[$pos])) {
			return FALSE;
		}

		while (TRUE) {

				// If characters has been obtained we will know whether the string starts as a sequence of letters or not.
				// Note: $len lags one character behind, so here it covers everything BEFORE the character in $cp.
			if ($len) {
				if ($letter) {	// We are in a sequence of words
						// The previous and current char are from single-byte sets vs. asian CJK sets:
					$scriptChange =
						($cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha'))
						|| ($cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha'));

					if (!$cType || $scriptChange) {
							// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// If a printjoin start length has been record, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						} elseif (!$printJoinLgd) {
								// If a printJoin char is found, record the length if it has not been recorded already:
							$printJoinLgd = $len;
						}
					} else {
							// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {
						// end of non-word reached
					return FALSE;
				}
			}

				// add byte-length of last found character
			$len += $bc;

				// end of string; return status of string till now
			if (!isset($str[$pos])) {
				return $letter;
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			list($cType) = $this->charType($cp);
			if ($cType) {
				continue;
			}

				// Setting letter to FALSE if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Determine the type of character
	 *
	 * @param	integer		Unicode number to evaluate
	 * @return	array		Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean). NULL is returned when the number is in none of the listed ranges.
	 */
	public function charType($cp) {

			// Numeric?
		if (
				($cp >= 0x30 && $cp <= 0x39)		// Arabic
/*
				($cp >= 0x660 && $cp <= 0x669) ||	// Arabic-Indic
				($cp >= 0x6F0 && $cp <= 0x6F9) ||	// Arabic-Indic (Iran, Pakistan, and India)
				($cp >= 0x3021 && $cp <= 0x3029) ||	// Hangzhou
*/
			) {
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
				($cp >= 0x41 && $cp <= 0x5A) ||		// Basic Latin: capital letters
				($cp >= 0x61 && $cp <= 0x7A) ||		// Basic Latin: small letters
				($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||	// Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
				($cp >= 0x100 && $cp < 0x280) ||	// Latin Extended-A and -B
				($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||	// Greek and Coptic excluding non-letters
				(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||	// Cyrillic and Cyrillic Supplement excluding historic miscellaneous
				(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||	// Hebrew: only accents and letters
				(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||	// Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
				($cp >= 0x1E00 && $cp < 0x2000)		// Latin Extended Additional and Greek Extended
			) {
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
				($cp >= 0x3040 && $cp <= 0x30FF) ||	// HIRAGANA and KATAKANA letters
				($cp >= 0x3130 && $cp <= 0x318F) ||	// Hangul Compatibility Jamo
				($cp >= 0x3400 && $cp <= 0x4DBF) ||	// CJK Unified Ideographs Extension A
				($cp >= 0x4E00 && $cp <= 0x9FAF) ||	// CJK Unified Ideographs
				($cp >= 0xAC00 && $cp <= 0xD7AF) ||	// Hangul Syllables
				($cp >= 0x20000 && $cp <= 0x2FA1F)	// CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
				// also include CJK and Kangxi radicals or Bopomofo letter?
			) {
			return array('cjk');
		}

			// Anything else is a non-letter; callers read this through list(), which yields NULL.
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * The byte count is derived from the number of leading 1-bits of the first
	 * byte; the input is not validated. Bytes missing at the end of the string
	 * are treated as zero, and a position past the end of the string yields 0
	 * with a length of 1.
	 *
	 * @param	string		UTF-8 multibyte character string (reference, not modified)
	 * @param	integer		The length of the character in bytes (reference, return value)
	 * @param	integer		Starting position in input string (byte offset)
	 * @param	boolean		If set, then a hex. number is returned (as string, prefixed with "x")
	 * @return	integer		UNICODE codepoint
	 */
	public function utf8_ord(&$str, &$len, $pos=0, $hex=FALSE) {
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord > 0x80) {
			for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
					// calculate number of extra bytes
				$bc++;
			}
			$len += $bc;

				// mask utf-8 lead-in bytes.
				// Byte 0xFF consists of 1-bits only ($bc == 7): it carries no payload and
				// the shift width would become negative, which is an error in PHP 7+.
			$ord = $bc < 7 ? $ord & ((1 << (6 - $bc)) - 1) : 0;

				// "bring in" data bytes
			for ($i = $pos + 1; $bc; $bc--, $i++) {
				$ord = ($ord << 6) | (isset($str[$i]) ? ord($str[$i]) & 0x3F : 0);
			}
		}

		return $hex ? 'x' . dechex($ord) : $ord;
	}
}
400
401
// XCLASS hook: if TYPO3_MODE is defined and a script path is registered for this file
// under TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], include that script (once).
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
	}
}
405 ?>