Updating alpha chars info in charType().
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Lexer for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 73: class tx_indexedsearch_lexer
39 * 104: function tx_indexedsearch_lexer()
40 * 115: function split2Words($wordString)
41 *
42 * SECTION: Helper functions
43 * 178: function addWords(&$words, &$wordString, $start, $len)
44 * 239: function get_word(&$str, $pos=0)
45 * 264: function utf8_is_letter(&$str, &$len, $pos=0)
46 * 328: function charType($cp)
47 * 371: function utf8_ord(&$str, &$len, $pos=0, $hex=false)
48 *
49 * TOTAL FUNCTIONS: 7
50 * (This index is automatically created/updated by the extension "extdeveval")
51 *
52 */
53
54
55
56
57
58
59
60
61
62
63
64
/**
 * Lexer class for indexed_search.
 * A lexer splits a UTF-8 text into words: sequences of numeric, alphabetic or
 * CJK characters, optionally joined by the configured "printjoin" characters.
 *
 * NOTE(review): the class keeps its PHP 4-style constructor (a method named
 * like the class) so existing callers and XCLASS extensions keep working;
 * confirm the targeted PHP version before introducing __construct().
 *
 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexedsearch_lexer {

    // Debugging options:
    var $debug = FALSE;     // If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display)
    var $debugString = '';

    var $csObj;             // Charset class object, t3lib_cs

    // Configuration of the lexer:
    var $lexerConf = array(
        'printjoins' => array(  // Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
            0x2e,   // "."
            0x2d,   // "-"
            0x5f,   // "_"
            0x3a,   // ":"
            0x2f,   // "/"
            0x2d,   // "-" DUPE
            0x27,   // "'"
            // 0x615 ARABIC SMALL HIGH TAH
        ),
        'casesensitive' => FALSE,   // Set, if case sensitive indexing is wanted.
        'removeChars' => array(     // List of unicode numbers of chars that will be removed before words are returned (eg. "-")
            0x2d    // "-"
        )
    );


    /**
     * Constructor: Initializes the charset class, t3lib_cs
     *
     * @return void
     */
    function tx_indexedsearch_lexer() {
        $this->csObj = &t3lib_div::makeInstance('t3lib_cs');
    }

    /**
     * Splitting string into words.
     * Used for indexing, can also be used to find words in query.
     *
     * If $this->debug is set, $this->debugString is filled with an HTML
     * rendering of the input where skipped (non-word) parts are wrapped in a
     * red <span>.
     *
     * @param string $wordString String with UTF-8 content to process.
     * @return array Array of words in utf-8
     */
    function split2Words($wordString) {

        // Reset debug string:
        $this->debugString = '';

        // Convert the string to lowercase unless case sensitive indexing is wanted:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
        }

        // Now, splitting words:
        $words = array();
        $pos = 0;

        while (TRUE) {
            $found = $this->get_word($wordString, $pos);

            // get_word() returns FALSE when the rest of the string holds no word:
            if (!is_array($found) || !$found[1]) {
                break;
            }
            $start = $found[0];
            $len = $found[1];

            $this->addWords($words, $wordString, $start, $len);

            if ($this->debug) {
                $this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, $start - $pos)) . '</span>' .
                    htmlspecialchars(substr($wordString, $start, $len));
            }

            // Continue right after the word just found:
            $pos = $start + $len;
        }

        return $words;
    }


    /**********************************
     *
     * Helper functions
     *
     ********************************/


    /**
     * Add word to word-array.
     * This function should be used to make sure CJK sequences are split up in the right way.
     *
     * @param array $words Array of accumulated words (reference, words are appended)
     * @param string $wordString Complete input string from where to extract word (reference)
     * @param integer $start Start position (byte offset) of word in input string
     * @param integer $len The byte length of the word string from start position
     * @return void
     */
    function addWords(&$words, &$wordString, $start, $len) {

        // Get word out of string:
        $theWord = substr($wordString, $start, $len);

        // Get the unicode number of the first char and find its type:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        $type = $this->charType($cp);
        $cType = is_array($type) ? $type[0] : FALSE;

        // If string is a CJK sequence we follow this algorithm:
        /*
            DESCRIPTION OF (CJK) ALGORITHM

            Continuous letters and numbers make up words. Spaces and symbols
            separate letters and numbers into words. This is sufficient for
            all western text.

            CJK doesn't use spaces or separators to separate words, so the only
            way to really find out what constitutes a word would be to have a
            dictionary and advanced heuristics. Instead, we form pairs from
            consecutive characters, in such a way that searches will find only
            characters that appear more-or-less the right sequence. For example:

                ABCDE => AB BC CD DE

            This works okay since both the index and the search query is split
            in the same manner, and since the set of characters is huge so the
            extra matches are not significant.

            (Hint taken from ZOPEs chinese user group)

            [Kasper: As far as I can see this will only work well with or-searches!]
        */
        if ($cType === 'cjk') {
            // Find total string length (in characters):
            $strlen = $this->csObj->utf8_strlen($theWord);

            // A single char yields one (one-char) entry; longer strings yield overlapping pairs:
            $lastStart = $strlen > 1 ? $strlen - 2 : $strlen - 1;
            for ($a = 0; $a <= $lastStart; $a++) {
                $words[] = $this->csObj->utf8_substr($theWord, $a, 2);
            }
        } else {    // Normal "single-byte" chars:
            // Remove chars:
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
            }
            // Add word:
            $words[] = $theWord;
        }
    }

    /**
     * Get the first word in a given utf-8 string (initial non-letters will be skipped)
     *
     * @param string $str Input string (reference)
     * @param integer $pos Starting position (byte offset) in input string
     * @return array 0: start, 1: len or false if no word has been found
     */
    function get_word(&$str, $pos=0) {

        $len = 0;

        // If return is true, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return array($pos, $len);
        }

        // If the return value was false it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
        $pos += $len;
        if (!isset($str[$pos])) {   // check end of string before looking for word of course.
            return FALSE;
        }

        $this->utf8_is_letter($str, $len, $pos);
        return array($pos, $len);
    }

    /**
     * See if a character is a letter (or a string of letters or non-letters).
     *
     * Starting at $pos the function consumes either a run of word characters
     * (returning TRUE) or a run of non-word characters (returning FALSE) and
     * reports the byte length of that run in $len. Print-join characters are
     * accepted inside a word, but are cut off again when they turn out to be
     * trailing (before a real non-word char or at the end of the string).
     *
     * @param string $str Input string (reference)
     * @param integer $len Byte-length of character sequence (reference, return value)
     * @param integer $pos Starting position (byte offset) in input string
     * @return boolean letter (or word) found
     */
    function utf8_is_letter(&$str, &$len, $pos=0) {

        $len = 0;
        $bc = 0;
        $cp = 0;
        $cType = $cType_prev = FALSE;   // Letter type
        $letter = TRUE;                 // looking for a letter?
        $printJoinLgd = 0;              // Length of the word before the first of a run of print-join chars (0 = none pending)

        if (!isset($str[$pos])) {       // Return false on end-of-string at this stage
            return FALSE;
        }

        while (TRUE) {

            // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
            if ($len) {
                if ($letter) {  // We are in a sequence of words
                    $wordBreak = !$cType    // The char was NOT a letter
                        // ... or the previous and current char are from single-byte sets vs. asian CJK sets
                        || ($cType_prev === 'cjk' && t3lib_div::inList('num,alpha', $cType))
                        || ($cType === 'cjk' && t3lib_div::inList('num,alpha', $cType_prev));

                    if ($wordBreak) {
                        // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return TRUE;
                        } elseif (!$printJoinLgd) {
                            // If a printJoin char is found, record the length if it has not been recorded already:
                            $printJoinLgd = $len;
                        }
                    } else {    // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) { // end of non-word reached
                    return FALSE;
                }
            }
            $len += $bc;    // add byte-length of last found character

            if (!isset($str[$pos])) {   // end of string; return status of string till now
                // Print-join chars at the very end of the string are not part of the word either:
                if ($letter && $printJoinLgd) {
                    $len = $printJoinLgd;
                }
                return $letter;
            }

            // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;

            // Determine the type:
            $cType_prev = $cType;
            $type = $this->charType($cp);
            $cType = is_array($type) ? $type[0] : FALSE;
            if ($cType) {
                continue;
            }

            // Setting letter to false if the first char was not a letter!
            if (!$len) {
                $letter = FALSE;
            }
        }

        return FALSE;
    }

    /**
     * Determine the type of character
     *
     * @param integer $cp Unicode number to evaluate
     * @return array Type of char; index-0: the main type: "num", "alpha" or "cjk" (Chinese / Japanese / Korean). NULL is returned if the char is none of these.
     */
    function charType($cp) {

        // Numeric?
        if (
                ($cp >= 0x30 && $cp <= 0x39)        // Arabic
                /*
                ($cp >= 0x660 && $cp <= 0x669) ||   // Arabic-Indic
                ($cp >= 0x6F0 && $cp <= 0x6F9) ||   // Arabic-Indic (Iran, Pakistan, and India)
                ($cp >= 0x3021 && $cp <= 0x3029) || // Hangzhou
                */
            ) {
            return array('num');
        }

        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
                ($cp >= 0x41 && $cp <= 0x5A) ||     // Basic Latin: capital letters
                ($cp >= 0x61 && $cp <= 0x7A) ||     // Basic Latin: small letters
                ($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||   // Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
                ($cp >= 0x100 && $cp < 0x280) ||    // Latin Extended-A and -B
                ($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||  // Greek and Coptic excluding non-letters
                (($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||     // Cyrillic and Cyrillic Supplement excluding historic miscellaneous
                (($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||     // Hebrew: only accents and letters
                (($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||   // Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
                ($cp >= 0x1E00 && $cp < 0x2000)     // Latin Extended Additional and Greek Extended
            ) {
            return array('alpha');
        }

        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
                ($cp >= 0x4E00 && $cp <= 0x9FAF) ||     // CJK Unified Ideographs
                ($cp >= 0xAC00 && $cp <= 0xD7AF) ||     // Hangul Syllables
                ($cp >= 0x3130 && $cp <= 0x318F) ||     // Hangul Compatibility Jamo
                ($cp >= 0x3040 && $cp <= 0x309F) ||     // HIRAGANA letters
                ($cp >= 0x30A0 && $cp <= 0x30FF)        // KATAKANA letters
            ) {
            return array('cjk');
        }

        // Not a word character:
        return NULL;
    }

    /**
     * Converts a UTF-8 multibyte character to a UNICODE codepoint
     *
     * Malformed input does not raise errors: bytes missing at the end of the
     * string are treated as zero bits.
     *
     * @param string $str UTF-8 multibyte character string (reference)
     * @param integer $len The byte length of the character (reference, return value)
     * @param integer $pos Starting position (byte offset) in input string
     * @param boolean $hex If set, then a hex. number is returned (string prefixed with "x")
     * @return integer UNICODE codepoint
     */
    function utf8_ord(&$str, &$len, $pos=0, $hex=false) {
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
        $len = 1;

        if ($ord > 0x80) {
            // Calculate number of extra bytes from the count of leading 1-bits:
            for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
                $bc++;
            }
            $len += $bc;

            // Mask utf-8 lead-in bits; an invalid lead byte can leave no payload bits at all
            // (never shift by a negative amount):
            $payloadBits = 6 - $bc;
            $ord = $payloadBits > 0 ? ($ord & ((1 << $payloadBits) - 1)) : 0;

            // "Bring in" data bytes:
            for ($i = $pos + 1; $bc; $bc--, $i++) {
                $dataBits = isset($str[$i]) ? (ord($str[$i]) & 0x3F) : 0;
                $ord = ($ord << 6) | $dataBits;
            }
        }

        return $hex ? 'x'.dechex($ord) : $ord;
    }
}
398
399
// XCLASS hook: if an extension class file is registered for this script in
// $TYPO3_CONF_VARS, include it. !empty() is used so that no undefined-index
// notice is raised when nothing has been registered.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])) {
    include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
}
402 }
403 ?>