[!!!][BUGFIX] *_user table password field is too short
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Lexer for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
32 */
33
34
35
36
37
38
39
40
41
42
43
44
45 /**
46 * Lexer class for indexed_search
47 * A lexer splits the text into words
48 *
49 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
50 * @package TYPO3
51 * @subpackage tx_indexedsearch
52 */
class tx_indexedsearch_lexer {

	/**
	 * Debug switch. If set, split2Words() fills $debugString with HTML output
	 * highlighting search / non-search words (for backend display).
	 *
	 * @var boolean
	 */
	public $debug = FALSE;

	/**
	 * HTML debug output of the last split2Words() call (only filled if $debug is set).
	 * Skipped (non-word) text is wrapped in a red <span>, words are appended as-is.
	 *
	 * @var string
	 */
	public $debugString = '';

	/**
	 * Charset class object
	 *
	 * @var t3lib_cs
	 */
	public $csObj;

	/**
	 * Configuration of the lexer
	 *
	 * @var array
	 */
	public $lexerConf = array(
			// Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
		'printjoins' => array(
			0x2e, // "."
			0x2d, // "-"
			0x5f, // "_"
			0x3a, // ":"
			0x2f, // "/"
			0x27, // "'"
		),
			// Set, if case sensitive indexing is wanted.
		'casesensitive' => FALSE,
			// Unicode numbers of chars that are removed from non-CJK words before they are returned (eg. "-")
		'removeChars' => array(
			0x2d // "-"
		)
	);


	/**
	 * Constructor: Initializes the charset class, t3lib_cs
	 *
	 * @return void
	 */
	public function __construct() {
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * Unless $lexerConf['casesensitive'] is set, the input is lowercased first.
	 * As a side effect $debugString is reset and, if $debug is set, refilled.
	 *
	 * @param string $wordString String with UTF-8 content to process.
	 * @return array Array of words in utf-8
	 */
	public function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Then convert the string to lowercase:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// The scanner below addresses single bytes, so make sure we really have a string:
		$wordString = (string)$wordString;

			// Now, splitting words:
		$pos = 0;
		$words = array();

		while (TRUE) {
			$match = $this->get_word($wordString, $pos);

				// get_word() returns FALSE at the end of the string; a zero length means nothing more was found
			if (!is_array($match) || !$match[1]) {
				break;
			}
			list($start, $len) = $match;

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
					// Bytes between the previous word and this one were skipped: show them in red
				$this->debugString .= '<span style="color:red">' .
					htmlspecialchars(substr($wordString, $pos, $start - $pos)) .
					'</span>' .
					htmlspecialchars(substr($wordString, $start, $len));
			}

			$pos = $start + $len;
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/


	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * @param array $words Array of accumulated words (reference, words are appended)
	 * @param string $wordString Complete Input string from where to extract word (reference, not modified)
	 * @param integer $start Start position (byte offset) of word in input string
	 * @param integer $len The Length (in bytes) of the word string from start position
	 * @return void
	 */
	public function addWords(&$words, &$wordString, $start, $len) {

			// Get word out of string:
		$theWord = substr($wordString, $start, $len);

			// Get the first char's unicode number and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		list($cType) = $this->charType($cp);

			// If string is a CJK sequence we follow this algorithm:
			/*
				DESCRIPTION OF (CJK) ALGORITHM

				Continuous letters and numbers make up words. Spaces and symbols
				separate letters and numbers into words. This is sufficient for
				all western text.

				CJK doesn't use spaces or separators to separate words, so the only
				way to really find out what constitutes a word would be to have a
				dictionary and advanced heuristics. Instead, we form pairs from
				consecutive characters, in such a way that searches will find only
				characters that appear more-or-less the right sequence. For example:

					ABCDE => AB BC CD DE

				This works okay since both the index and the search query is split
				in the same manner, and since the set of characters is huge so the
				extra matches are not significant.

				(Hint taken from ZOPEs chinese user group)

				[Kasper: As far as I can see this will only work well with or-searches!]
			*/
		if ($cType == 'cjk') {
				// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// Traverse string length and add words as pairs of two chars.
				// A single char is added alone; otherwise the last char only occurs as second half of the last pair.
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else { // Normal "single-byte" chars:
				// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
				// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param string $str Input string (reference, not modified)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @return array 0: start, 1: len or FALSE if no word has been found
	 */
	public function get_word(&$str, $pos = 0) {

		$len = 0;

			// If return is TRUE, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}

			// If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
		$pos += $len;
		if (!isset($str[$pos])) {
				// Check end of string before looking for word of course.
			return FALSE;
		}

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Scans from $pos and measures one homogeneous run: either a word (letters,
	 * possibly joined by "printjoins" chars, ending where the script changes
	 * between CJK and num/alpha) or a run of non-word chars.
	 *
	 * @param string $str Input string (reference, not modified)
	 * @param integer $len Byte-length of character sequence (reference, return value)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @return boolean letter (or word) found
	 */
	public function utf8_is_letter(&$str, &$len, $pos = 0) {

		$len = 0;
		$bc = 0; // Byte-length of the char read last
		$cp = 0; // Unicode number of the char read last
		$printJoinLgd = 0; // Length of the word before a trailing run of print-join chars (0 = none pending)
		$cType = $cType_prev = FALSE; // Letter type
		$letter = TRUE; // looking for a letter?

		if (!isset($str[$pos])) {
				// Return FALSE on end-of-string at this stage
			return FALSE;
		}

		while (TRUE) {

				// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
			if ($len) {
				if ($letter) { // We are in a sequence of words
						// The previous and current char are from single-byte sets vs. asian CJK sets
					$scriptChange =
						($cType_prev === 'cjk' && in_array($cType, array('num', 'alpha'), TRUE))
						|| ($cType === 'cjk' && in_array($cType_prev, array('num', 'alpha'), TRUE));

					if (!$cType || $scriptChange) {
							// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
							// (Loose comparison kept on purpose so that an externally overridden configuration keeps working.)
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						}
							// A printJoin char is found, record the length if it has not been recorded already:
						if (!$printJoinLgd) {
							$printJoinLgd = $len;
						}
					} else { // When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) { // end of non-word reached
					return FALSE;
				}
			}
			$len += $bc; // add byte-length of last found character

			if (!isset($str[$pos])) {
					// End of string; return status of string till now
				return $letter;
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			list($cType) = $this->charType($cp);
			if ($cType) {
				continue;
			}

				// Setting letter to FALSE if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Determine the type of character
	 *
	 * @param integer $cp Unicode number to evaluate
	 * @return array Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL is returned if the char belongs to none of these.
	 */
	public function charType($cp) {

			// Numeric?
		if (
				($cp >= 0x30 && $cp <= 0x39)		// Arabic
			) {
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
				($cp >= 0x41 && $cp <= 0x5A) ||		// Basic Latin: capital letters
				($cp >= 0x61 && $cp <= 0x7A) ||		// Basic Latin: small letters
				($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||		// Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
				($cp >= 0x100 && $cp < 0x280) ||		// Latin Extended-A and -B
				($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||		// Greek and Coptic excluding non-letters
				(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||		// Cyrillic and Cyrillic Supplement excluding historic miscellaneous
				(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||		// Hebrew: only accents and letters
				(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||		// Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
				($cp >= 0x1E00 && $cp < 0x2000)		// Latin Extended Additional and Greek Extended
			) {
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
				($cp >= 0x3040 && $cp <= 0x30FF) ||		// HIRAGANA and KATAKANA letters
				($cp >= 0x3130 && $cp <= 0x318F) ||		// Hangul Compatibility Jamo
				($cp >= 0x3400 && $cp <= 0x4DBF) ||		// CJK Unified Ideographs Extension A
				($cp >= 0x4E00 && $cp <= 0x9FAF) ||		// CJK Unified Ideographs
				($cp >= 0xAC00 && $cp <= 0xD7AF) ||		// Hangul Syllables
				($cp >= 0x20000 && $cp <= 0x2FA1F)		// CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
				// also include CJK and Kangxi radicals or Bopomofo letter?
			) {
			return array('cjk');
		}

			// Anything else is not a word char. Callers rely on "list($cType) = ..." yielding an empty type here.
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * No validation of the byte sequence is done. Bytes missing at the end of
	 * the string are treated as 0, and a lead byte that cannot occur in UTF-8
	 * (0xFE, 0xFF) contributes no data bits.
	 *
	 * @param string $str UTF-8 multibyte character string (reference, not modified)
	 * @param integer $len The length of the character in bytes (reference, return value)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @param boolean $hex If set, then a hex. number is returned (as string prefixed with "x")
	 * @return integer UNICODE codepoint
	 */
	public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord > 0x80) {
				// calculate number of extra bytes: one less than the number of leading 1-bits of the lead byte
			$extraBytes = -1;
			for ($mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
				$extraBytes++;
			}
			$len += $extraBytes;

				// mask utf-8 lead-in bytes; for 6 or more extra bytes no data bits are left
				// (and a negative shift width would throw an ArithmeticError)
			$ord = $extraBytes < 6 ? ($ord & ((1 << (6 - $extraBytes)) - 1)) : 0;

				// "bring in" data bytes
			for ($i = $pos + 1; $i <= $pos + $extraBytes; $i++) {
				$byte = isset($str[$i]) ? ord($str[$i]) : 0;
				$ord = ($ord << 6) | ($byte & 0x3F);
			}
		}

		return $hex ? 'x' . dechex($ord) : $ord;
	}
}
398 ?>