1b23144183a7152885d6b700ce722cdca0b30b1a
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2010 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Lexer for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 73: class tx_indexedsearch_lexer
39 * 105: function tx_indexedsearch_lexer()
40 * 116: function split2Words($wordString)
41 *
42 * SECTION: Helper functions
43 * 178: function addWords(&$words, &$wordString, $start, $len)
44 * 239: function get_word(&$str, $pos=0)
45 * 264: function utf8_is_letter(&$str, &$len, $pos=0)
46 * 329: function charType($cp)
47 * 383: function utf8_ord(&$str, &$len, $pos=0, $hex=false)
48 *
49 * TOTAL FUNCTIONS: 7
50 * (This index is automatically created/updated by the extension "extdeveval")
51 *
52 */
53
54
55
56
57
58
59
60
61
62
63
64
65 /**
66 * Lexer class for indexed_search
67 * A lexer splits the text into words
68 *
69 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
70 * @package TYPO3
71 * @subpackage tx_indexedsearch
72 */
class tx_indexedsearch_lexer {

	/**
	 * If set, $debugString is filled with HTML output highlighting
	 * search / non-search words (for backend display).
	 *
	 * @var boolean
	 */
	public $debug = FALSE;

	/**
	 * HTML debug output built by split2Words() when $debug is set.
	 *
	 * @var string
	 */
	public $debugString = '';

	/**
	 * Charset class object
	 *
	 * @var t3lib_cs
	 */
	public $csObj;

	/**
	 * Configuration of the lexer.
	 *
	 * - printjoins: Unicode numbers of chars that are allowed INSIDE a
	 *   sequence of letter chars (alphanum + CJK).
	 * - casesensitive: Set, if case sensitive indexing is wanted.
	 * - removeChars: Unicode numbers of chars that will be removed before
	 *   words are returned (eg. "-").
	 *
	 * @var array
	 */
	public $lexerConf = array(
		'printjoins' => array(
			0x2e,	// "."
			0x2d,	// "-"
			0x5f,	// "_"
			0x3a,	// ":"
			0x2f,	// "/"
			0x27,	// "'"
			// 0x615,	// ARABIC SMALL HIGH TAH
		),
		'casesensitive' => FALSE,
		'removeChars' => array(
			0x2d	// "-"
		)
	);

	/**
	 * Constructor: Initializes the charset class, t3lib_cs
	 *
	 * Declared as __construct() so that it also runs on PHP versions which
	 * no longer treat a method named like the class as the constructor.
	 *
	 * @return void
	 */
	public function __construct() {
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Old-style constructor name, kept so that code calling
	 * parent::tx_indexedsearch_lexer() continues to work.
	 * It only forwards to __construct().
	 *
	 * @return void
	 */
	public function tx_indexedsearch_lexer() {
		$this->__construct();
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * @param string $wordString String with UTF-8 content to process.
	 * @return array Array of words in utf-8
	 */
	public function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Convert the string to lowercase unless case sensitive indexing is configured:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// Now, splitting words:
		$words = array();
		$pos = 0;

		while (TRUE) {
			$match = $this->get_word($wordString, $pos);

				// get_word() returns FALSE when the end of the string is reached:
			if (!is_array($match) || !$match[1]) {
				break;
			}
			list($start, $len) = $match;

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
					// Skipped (non-word) bytes in red, followed by the word itself:
				$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, $start - $pos)) . '</span>' .
					htmlspecialchars(substr($wordString, $start, $len));
			}

				// Continue right after the word just found:
			$pos = $start + $len;
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/

	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * @param array $words Array of accumulated words (reference, words are appended)
	 * @param string $wordString Complete input string from where to extract word (reference)
	 * @param integer $start Start position (in bytes) of word in input string
	 * @param integer $len The length (in bytes) of the word string from start position
	 * @return void
	 */
	public function addWords(&$words, &$wordString, $start, $len) {

			// Get word out of string:
		$theWord = substr($wordString, $start, $len);

			// Get the unicode number of the first char and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		list($cType) = $this->charType($cp);

			// If string is a CJK sequence we follow this algorithm:
			/*
				DESCRIPTION OF (CJK) ALGORITHM

				Continuous letters and numbers make up words. Spaces and symbols
				separate letters and numbers into words. This is sufficient for
				all western text.

				CJK doesn't use spaces or separators to separate words, so the only
				way to really find out what constitutes a word would be to have a
				dictionary and advanced heuristics. Instead, we form pairs from
				consecutive characters, in such a way that searches will find only
				characters that appear more-or-less the right sequence. For example:

					ABCDE => AB BC CD DE

				This works okay since both the index and the search query is split
				in the same manner, and since the set of characters is huge so the
				extra matches are not significant.

				(Hint taken from ZOPEs chinese user group)

				[Kasper: As far as I can see this will only work well with or-searches!]
			*/
		if ($cType == 'cjk') {
				// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// A single char yields one "pair" containing just that char,
				// otherwise every char except the last one starts a pair:
			$pairCount = ($strlen == 1) ? 1 : $strlen - 1;
			for ($a = 0; $a < $pairCount; $a++) {
				$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
			}
		} else {	// Normal "single-byte" chars:
				// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
				// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param string $str Input string (reference)
	 * @param integer $pos Starting position (in bytes) in input string
	 * @return array 0: start, 1: len or false if no word has been found
	 */
	public function get_word(&$str, $pos = 0) {

		$len = 0;

			// If return is true, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}

			// If the return value was false it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
		$pos += $len;

			// Check end of string before looking for a word:
		if (!isset($str[$pos])) {
			return FALSE;
		}

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Scans from $pos and measures the run of chars that have the same
	 * "letter / non-letter" status as the first char. The byte length of that
	 * run is written to $len.
	 *
	 * @param string $str Input string (reference)
	 * @param integer $len Byte-length of character sequence (reference, return value)
	 * @param integer $pos Starting position (in bytes) in input string
	 * @return boolean letter (or word) found
	 */
	public function utf8_is_letter(&$str, &$len, $pos = 0) {

		$len = 0;
		$bc = 0;
		$cp = 0;
		$printJoinLgd = 0;	// Length of the word before a trailing run of print-join chars (0 = none recorded)
		$cType = $cType_prev = FALSE;	// Letter type
		$letter = TRUE;	// looking for a letter?

			// Return false on end-of-string at this stage
		if (!isset($str[$pos])) {
			return FALSE;
		}

		while (TRUE) {

				// If characters has been obtained we will know whether the string starts as a sequence of letters or not.
				// Note: $len covers the chars BEFORE the most recently read one; $cp / $cType describe the most recently read char.
			if ($len) {
				if ($letter) {	// We are in a sequence of words
					if (!$cType	// The char was NOT a letter
						|| ($cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha'))	// ... or the previous and current char are from
						|| ($cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha'))	// single-byte sets vs. asian CJK sets
					) {
							// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						} elseif (!$printJoinLgd) {
								// If a printJoin char is found, record the length if it has not been recorded already:
							$printJoinLgd = $len;
						}
					} else {
							// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {
						// End of non-word reached
					return FALSE;
				}
			}

				// Add byte-length of last found character
			$len += $bc;

				// End of string; return status of string till now
			if (!isset($str[$pos])) {
				return $letter;
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			list($cType) = $this->charType($cp);
			if ($cType) {
				continue;
			}

				// Setting letter to false if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Determine the type of character
	 *
	 * @param integer $cp Unicode number to evaluate
	 * @return array Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL is returned if the char belongs to none of these.
	 */
	public function charType($cp) {

			// Numeric? Only the digits 0-9 are recognized. Other digit ranges are
			// currently NOT treated as numbers:
			//   0x660-0x669   Arabic-Indic
			//   0x6F0-0x6F9   Arabic-Indic (Iran, Pakistan, and India)
			//   0x3021-0x3029 Hangzhou
		if ($cp >= 0x30 && $cp <= 0x39) {
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
			($cp >= 0x41 && $cp <= 0x5A) ||	// Basic Latin: capital letters
			($cp >= 0x61 && $cp <= 0x7A) ||	// Basic Latin: small letters
			($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||	// Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
			($cp >= 0x100 && $cp < 0x280) ||	// Latin Extended-A and -B
			($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||	// Greek and Coptic excluding non-letters
			(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||	// Cyrillic and Cyrillic Supplement excluding historic miscellaneous
			(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||	// Hebrew: only accents and letters
			(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||	// Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
			($cp >= 0x1E00 && $cp < 0x2000)	// Latin Extended Additional and Greek Extended
		) {
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
			($cp >= 0x3040 && $cp <= 0x30FF) ||	// HIRAGANA and KATAKANA letters
			($cp >= 0x3130 && $cp <= 0x318F) ||	// Hangul Compatibility Jamo
			($cp >= 0x3400 && $cp <= 0x4DBF) ||	// CJK Unified Ideographs Extension A
			($cp >= 0x4E00 && $cp <= 0x9FAF) ||	// CJK Unified Ideographs
			($cp >= 0xAC00 && $cp <= 0xD7AF) ||	// Hangul Syllables
			($cp >= 0x20000 && $cp <= 0x2FA1F)	// CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
				// also include CJK and Kangxi radicals or Bopomofo letter?
		) {
			return array('cjk');
		}

			// Neither number, letter nor CJK (callers using list() get NULL as type):
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * The input is not validated as UTF-8. Bytes missing at the end of the
	 * string are treated as zero bytes.
	 *
	 * @param string $str UTF-8 multibyte character string (reference)
	 * @param integer $len The length of the character in bytes (reference, return value)
	 * @param integer $pos Starting position (in bytes) in input string
	 * @param boolean $hex If set, then a hex. number is returned (prefixed with "x")
	 * @return integer UNICODE codepoint
	 */
	public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord > 0x80) {
				// Calculate number of extra bytes (number of leading 1-bits minus one):
			for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
				$bc++;
			}
			$len += $bc;

				// Mask utf-8 lead-in bytes. The invalid lead byte 0xFF gives $bc = 7,
				// which must not result in a shift by a negative number:
			$ord = ($bc < 7) ? ($ord & ((1 << (6 - $bc)) - 1)) : 0;

				// "Bring in" data bytes (6 bits each):
			for ($i = $pos + 1; $bc; $bc--, $i++) {
				$dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
				$ord = ($ord << 6) | ($dataByte & 0x3F);
			}
		}

		return $hex ? 'x' . dechex($ord) : $ord;
	}
}
404
405
// If a file has been registered for this script under the XCLASS key of
// TYPO3_CONF_VARS (for the current TYPO3_MODE), include it once:
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
	}
}
409 ?>