10f6719a14cacab0d1b2ced3d4247864b5920fea
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Lexer for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
32 */
33
34
35
36
37
38
39
40
41
42
43
44
/**
 * Lexer class for indexed_search
 * A lexer splits the text into words
 *
 * All positions and lengths handled by this class are BYTE offsets into a
 * UTF-8 encoded string, not character counts.
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexedsearch_lexer {

	/**
	 * If set, $debugString is filled with HTML output highlighting
	 * search / non-search words (for backend display)
	 *
	 * @var boolean
	 */
	public $debug = FALSE;

	/**
	 * HTML debug output produced by split2Words() when $debug is set
	 *
	 * @var string
	 */
	public $debugString = '';

	/**
	 * Charset class object
	 *
	 * @var t3lib_cs
	 */
	public $csObj;

	/**
	 * Configuration of the lexer
	 *
	 * @var array
	 */
	public $lexerConf = array(
			// Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
		'printjoins' => array(
			0x2e, // "."
			0x2d, // "-"
			0x5f, // "_"
			0x3a, // ":"
			0x2f, // "/"
			0x27, // "'"
		),
			// Set, if case sensitive indexing is wanted.
		'casesensitive' => FALSE,
			// List of unicode numbers of chars that will be removed before words are returned (eg. "-")
		'removeChars' => array(
			0x2d // "-"
		)
	);

	/**
	 * Constructor: Initializes the charset class, t3lib_cs
	 *
	 * @return void
	 */
	public function __construct() {
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * @param string String with UTF-8 content to process.
	 * @return array Array of words in utf-8
	 */
	public function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Convert the string to lowercase unless case sensitive indexing is configured:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// Now, splitting words:
		$pos = 0;
		$words = array();

		while (TRUE) {
			$found = $this->get_word($wordString, $pos);
				// get_word() returns FALSE at end of input; a zero length also means "no more words".
			if (!is_array($found) || !$found[1]) {
				break;
			}
			$start = $found[0];
			$len = $found[1];

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
					// Skipped (non-word) bytes are shown in red, followed by the word itself.
				$this->debugString .= '<span style="color:red">' .
					htmlspecialchars(substr($wordString, $pos, $start - $pos)) .
					'</span>' .
					htmlspecialchars(substr($wordString, $start, $len));
			}

			$pos = $start + $len;
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/

	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * DESCRIPTION OF (CJK) ALGORITHM
	 *
	 * Continuous letters and numbers make up words. Spaces and symbols
	 * separate letters and numbers into words. This is sufficient for
	 * all western text.
	 *
	 * CJK doesn't use spaces or separators to separate words, so the only
	 * way to really find out what constitutes a word would be to have a
	 * dictionary and advanced heuristics. Instead, we form pairs from
	 * consecutive characters, in such a way that searches will find only
	 * characters that appear more-or-less the right sequence. For example:
	 *
	 *   ABCDE => AB BC CD DE
	 *
	 * This works okay since both the index and the search query is split
	 * in the same manner, and since the set of characters is huge so the
	 * extra matches are not significant.
	 *
	 * (Hint taken from ZOPEs chinese user group)
	 *
	 * [Kasper: As far as I can see this will only work well with or-searches!]
	 *
	 * @param array Array of accumulated words (reference, words are appended)
	 * @param string Complete Input string from where to extract word
	 * @param integer Start position (byte offset) of word in input string
	 * @param integer The Length (in bytes) of the word string from start position
	 * @return void
	 */
	public function addWords(&$words, &$wordString, $start, $len) {

			// Get word out of string:
		$theWord = substr($wordString, $start, $len);

			// The type of the first character decides how the whole word is treated:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		$type = $this->charType($cp);
		$cType = $type ? $type[0] : FALSE;

		if ($cType === 'cjk') {
				// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// Add overlapping pairs of two chars; a single char is added as it is:
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else {
				// Normal "single-byte" chars: strip the configured chars, then add the word.
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param string Input string (reference)
	 * @param integer Starting position (byte offset) in input string
	 * @return array 0: start, 1: len or FALSE if no word has been found
	 */
	public function get_word(&$str, $pos = 0) {

		$len = 0;

			// If return is TRUE, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}

			// FALSE means a sequence of non-word chars was found (or a blank string) - skip it
			// and check for end of string before looking for the word that follows.
		$pos += $len;
		if (!isset($str[$pos])) {
			return FALSE;
		}

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Scans from $pos and measures the longest run that is either a word (letters,
	 * possibly joined by "printjoins" chars) or a sequence of non-letters. The scan
	 * works with one character of look-ahead: $len always covers the characters
	 * BEFORE the one that is currently being classified.
	 *
	 * @param string Input string (reference)
	 * @param integer Byte-length of character sequence (reference, return value)
	 * @param integer Starting position (byte offset) in input string
	 * @return boolean letter (or word) found
	 */
	public function utf8_is_letter(&$str, &$len, $pos = 0) {

		$len = 0;
		$bc = 0;
		$cp = 0;
		$cType = $cType_prev = FALSE; // Letter type
		$letter = TRUE; // looking for a letter?
		$printJoinLgd = 0; // Word length before the first of a run of print-join chars (0 = none pending)

			// Return FALSE on end-of-string at this stage
		if (!isset($str[$pos])) {
			return FALSE;
		}

		while (TRUE) {

				// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
			if ($len) {
				if ($letter) {
						// We are inside a word
					if (!$cType || $this->isScriptBoundary($cType_prev, $cType)) {
							// Non-letter, or switch between CJK and single-byte sets.
							// Loose comparison is kept on purpose: lexerConf may be overridden from configuration.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// Not a print-join char, so the word ends here. Trailing print-join
								// chars recorded earlier are cut off again.
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						} elseif (!$printJoinLgd) {
								// First print-join char of a run: remember the word length so far.
							$printJoinLgd = $len;
						}
					} else {
							// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {
						// End of non-word sequence reached
					return FALSE;
				}
			}

				// Add byte-length of last found character
			$len += $bc;

				// End of string; return status of string till now
			if (!isset($str[$pos])) {
					// A word must not keep trailing print-join chars just because the input ends
					// here ("foo." has to give the same word as "foo. bar").
				if ($printJoinLgd) {
					$len = $printJoinLgd;
				}
				return $letter;
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			$type = $this->charType($cp);
			$cType = $type ? $type[0] : FALSE;
			if ($cType) {
				continue;
			}

				// Setting letter to FALSE if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Tells whether two adjacent character types mark a change between CJK
	 * and the single-byte (num / alpha) sets, in either direction.
	 *
	 * @param mixed Type of the previous character ("num", "alpha", "cjk" or FALSE)
	 * @param mixed Type of the current character ("num", "alpha", "cjk" or FALSE)
	 * @return boolean TRUE if one type is "cjk" and the other is "num" or "alpha"
	 */
	protected function isScriptBoundary($prevType, $curType) {
		$prevIsLatin = ($prevType === 'num' || $prevType === 'alpha');
		$curIsLatin = ($curType === 'num' || $curType === 'alpha');

		return ($prevType === 'cjk' && $curIsLatin) || ($curType === 'cjk' && $prevIsLatin);
	}

	/**
	 * Determine the type of character
	 *
	 * @param integer Unicode number to evaluate
	 * @return array Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean). NULL is returned if the char is none of these.
	 */
	public function charType($cp) {

			// Numeric?
		if ($cp >= 0x30 && $cp <= 0x39) { // Arabic
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
			($cp >= 0x41 && $cp <= 0x5A) || // Basic Latin: capital letters
			($cp >= 0x61 && $cp <= 0x7A) || // Basic Latin: small letters
			($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) || // Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
			($cp >= 0x100 && $cp < 0x280) || // Latin Extended-A and -B
			($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) || // Greek and Coptic excluding non-letters
			(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) || // Cyrillic and Cyrillic Supplement excluding historic miscellaneous
			(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) || // Hebrew: only accents and letters
			(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) || // Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
			($cp >= 0x1E00 && $cp < 0x2000) // Latin Extended Additional and Greek Extended
		) {
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
			($cp >= 0x3040 && $cp <= 0x30FF) || // HIRAGANA and KATAKANA letters
			($cp >= 0x3130 && $cp <= 0x318F) || // Hangul Compatibility Jamo
			($cp >= 0x3400 && $cp <= 0x4DBF) || // CJK Unified Ideographs Extension A
			($cp >= 0x4E00 && $cp <= 0x9FAF) || // CJK Unified Ideographs
			($cp >= 0xAC00 && $cp <= 0xD7AF) || // Hangul Syllables
			($cp >= 0x20000 && $cp <= 0x2FA1F) // CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
				// also include CJK and Kangxi radicals or Bopomofo letter?
		) {
			return array('cjk');
		}

			// Anything else is not a word character.
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * Well-formed sequences (including the historic 5 and 6 byte forms) are decoded
	 * as before. A byte that cannot start a sequence, or a sequence that is truncated
	 * or has a broken continuation byte, consumes exactly one byte and yields U+FFFD,
	 * which charType() does not classify as a letter. A position beyond the end of
	 * the string yields 0 with a length of 1.
	 *
	 * @param string UTF-8 multibyte character string (reference)
	 * @param integer The length of the character in bytes (reference, return value)
	 * @param integer Starting position (byte offset) in input string
	 * @param boolean If set, then a hex. number is returned
	 * @return integer UNICODE codepoint (or string "x" + hex. number if $hex is set)
	 */
	public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
		$len = 1;
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;

		if ($ord >= 0xC0 && $ord <= 0xFD) {
				// Lead byte: the number of 1-bits following the first one is the number of extra bytes
			$extra = 0;
			for ($probe = $ord << 1; $probe & 0x80; $probe = $probe << 1) {
				$extra++;
			}

				// Mask utf-8 lead-in bits, then "bring in" the data bytes
			$value = $ord & ((1 << (6 - $extra)) - 1);
			$valid = TRUE;
			for ($i = 1; $i <= $extra; $i++) {
				if (!isset($str[$pos + $i]) || (ord($str[$pos + $i]) & 0xC0) !== 0x80) {
					$valid = FALSE;
					break;
				}
				$value = ($value << 6) | (ord($str[$pos + $i]) & 0x3F);
			}

			if ($valid) {
				$ord = $value;
				$len = 1 + $extra;
			} else {
				$ord = 0xFFFD;
			}
		} elseif ($ord >= 0x80) {
				// Stray continuation byte (0x80-0xBF) or impossible lead byte (0xFE, 0xFF)
			$ord = 0xFFFD;
		}

		return $hex ? 'x' . dechex($ord) : $ord;
	}
}
382 ?>