See Changelog: Updates to Indexed Search (mainly), t3lib_cs (bug), t3lib_tcemain...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.lexer.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Lexer for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 91: class tx_indexedsearch_lexer
39 * 105: function tx_indexedsearch_lexer()
40 * 117: function split2Words($wordString)
41 *
42 * SECTION: Helper functions
43 * 176: function utf8_ord(&$str, &$len, $pos=0, $hex=false)
44 * 201: function utf8_is_letter(&$str, &$len, $pos=0, $scan=false)
45 * 284: function get_word($charset, &$str, $pos=0)
46 *
47 * TOTAL FUNCTIONS: 5
48 * (This index is automatically created/updated by the extension "extdeveval")
49 *
50 */
51
52
53
54 /*
55
56 DESCRIPTION OF (CJK) ALGORITHM
57
58 Continuous letters and numbers make up words. Spaces and symbols
59 separate letters and numbers into words. This is sufficient for
60 all western text.
61
62 CJK doesn't use spaces or separators to separate words, so the only
63 way to really find out what constitutes a word would be to have a
64 dictionary and advanced heuristics. Instead, we form pairs from
65 consecutive characters, in such a way that searches will find only
66 characters that appear in more-or-less the right sequence. For example:
67
68 ABCDE => AB BC CD DE
69
70 This works okay since both the index and the search query are split
71 in the same manner, and since the set of characters is huge so the
72 extra matches are not significant.
73
74 */
75
76
77
78
79
80
81
82
83 /**
84 * Lexer class for indexed_search
85 * A lexer splits the text into words
86 *
87 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
88 * @package TYPO3
89 * @subpackage tx_indexedsearch
90 */
class tx_indexedsearch_lexer {

	var $debug = FALSE;		// If set, split2Words() records an HTML trace of the splitting in $debugString
	var $debugString = '';	// HTML trace of the last split2Words() call (only filled when $debug is set)

	var $csObj;		// Charset class object , t3lib_cs



	/**
	 * Constructor: Initializes the charset class, t3lib_cs
	 *
	 * @return	void
	 */
	function __construct() {
			// No reference assignment here: assigning the result of a function call
			// by reference raises "Only variables should be assigned by reference".
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Old-style (PHP 4) constructor, kept so that existing code which calls it
	 * explicitly (or runs on an engine that only knows class-named constructors)
	 * still gets an initialized object.
	 *
	 * @return	void
	 */
	function tx_indexedsearch_lexer() {
		$this->__construct();
	}


	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * The string is lowercased first; then every run of letter characters
	 * (see utf8_cp_is_letter()) becomes one word. Non-letter runs are dropped.
	 *
	 * @param	string		String with UTF-8 content to process.
	 * @return	array		Array of words in utf-8
	 */
	function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Then convert the string to lowercase:
		$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');

			// Now, splitting words:
		$words = array();
		$pos = 0;

			// get_word() returns false at the end of the string; a zero length also ends the scan.
		while (is_array($word = $this->get_word('utf-8', $wordString, $pos)) && $word[1] > 0) {
			list($start, $len) = $word;
			$words[] = substr($wordString, $start, $len);

			if ($this->debug) {
					// Skipped separator characters are shown in red, followed by the word itself:
				$this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.htmlspecialchars(substr($wordString,$start,$len));
			}

			$pos = $start+$len;
		}

		return $words;
	}














	/************************************
	 *
	 * Helper functions
	 *
	 ************************************/

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * Malformed input never reads outside the string: an invalid lead byte
	 * (a stray continuation byte, 0xFE or 0xFF) or a sequence that is truncated
	 * or interrupted yields U+FFFD, and $len covers only the bytes that were
	 * actually consumed. A position outside the string yields 0 with $len = 1.
	 *
	 * @param	string		UTF-8 multibyte character string (reference)
	 * @param	integer		The length of the character (reference, return value)
	 * @param	integer		Starting position in input string
	 * @param	boolean		If set, then a hex. number is returned
	 * @return	integer		UNICODE codepoint (or string 'x' + hex digits if $hex is set)
	 */
	function utf8_ord(&$str, &$len, $pos=0, $hex=false) {
		$len = 1;
		$strLen = strlen($str);

		if ($pos < 0 || $pos >= $strLen) {
			$ord = 0;
		} else {
			$ord = ord($str[$pos]);

			if ($ord >= 0x80) {
				for ($bc=-1, $mbs=$ord; $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// calculate number of extra bytes

				if ($bc < 1 || $bc > 5) {
						// Not a lead byte: stray continuation byte (bc=0) or 0xFE/0xFF (bc=6/7).
					$ord = 0xFFFD;
				} else {
					$ord = $ord & ((1 << (6-$bc)) - 1);	// mask utf-8 lead-in bytes
					for ($i=$pos+1; $bc; $bc--, $i++) {	// "bring in" data bytes
						if ($i >= $strLen || (ord($str[$i]) & 0xC0) != 0x80) {
								// Sequence is truncated or interrupted: stop before the offending byte.
							$ord = 0xFFFD;
							break;
						}
						$ord = ($ord << 6) | (ord($str[$i]) & 0x3F);
						$len++;
					}
				}
			}
		}

		return $hex ? 'x'.dechex($ord) : $ord;
	}

	/**
	 * Tells whether a UNICODE codepoint counts as a word character ("letter").
	 * Internal helper for utf8_is_letter().
	 *
	 * @param	integer		UNICODE codepoint
	 * @return	boolean		True if the codepoint is a letter or a digit in one of the supported blocks
	 */
	function utf8_cp_is_letter($cp) {
		if ($cp < 0x80) {
			return ($cp >= 0x30 && $cp <= 0x39)		// Numbers
				|| ($cp >= 0x41 && $cp <= 0x5A)		// Basic Latin: capital letters
				|| ($cp >= 0x61 && $cp <= 0x7A);	// small letters
		}

		if ($cp < 0x100) {	// Latin-1 Supplement (0x80-0xFF)
				// 0x80-0x9F are unassigned
				// 0xA0-0xBF are non-letters
				// 0xD7 and 0xF7 are the multiplication and division sign
			return $cp >= 0xC0 && $cp != 0xD7 && $cp != 0xF7;
		}

			// I don't think we need to support these:
			//  Latin Extended Additional
			//  Greek Extended
			//  Alphabetic Presentation Forms
			//  Arabic Presentation Forms-A
			//  Arabic Presentation Forms-B
		return ($cp < 0x280)					// Latin Extended-A and -B
			|| ($cp >= 0x370 && $cp < 0x400)	// Greek and Coptic
			|| ($cp >= 0x400 && $cp < 0x530)	// Cyrillic and Cyrillic Supplement
			|| ($cp >= 0x590 && $cp < 0x600)	// Hebrew
			|| ($cp >= 0x600 && $cp < 0x700);	// Arabic
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Without $scan only the character at $pos is examined. With $scan the
	 * whole run of characters of the same kind (all letters or all non-letters)
	 * starting at $pos is measured. At the end of the string false is returned
	 * and $len is 0.
	 *
	 * @param	string		Input string (reference)
	 * @param	integer		Byte-length of character sequence (reference, return value)
	 * @param	integer		Starting position in input string
	 * @param	boolean		If set will scan for a whole sequence of characters
	 * @return	boolean		letter (or word) found
	 */
	function utf8_is_letter(&$str, &$len, $pos=0, $scan=false) {
		$len = 0;
		$strLen = strlen($str);

		if ($pos < 0 || $pos >= $strLen)	return false;	// end of string

			// The first character decides what kind of run this is:
		$bc = 0;
		$letter = $this->utf8_cp_is_letter($this->utf8_ord($str,$bc,$pos));
		$len = $bc;

		if (!$scan)	return $letter;		// report single letter status

			// Extend the run for as long as the following characters are of the same kind:
		$pos += $bc;
		while ($pos < $strLen) {
			$found = $this->utf8_cp_is_letter($this->utf8_ord($str,$bc,$pos));
			if ($found != $letter)	break;	// end of word / end of non-word reached

			$len += $bc;
			$pos += $bc;
		}

		return $letter;
	}

	/**
	 * Get the first word in a given string (initial non-letters will be skipped)
	 *
	 * @param	string		The charset (only 'utf-8' is supported)
	 * @param	string		Input string (reference)
	 * @param	integer		Starting position in input string
	 * @return	array		0: start, 1: len or false if no word has been found
	 */
	function get_word($charset, &$str, $pos=0) {
		if ($charset != 'utf-8')	return false;

		$len = 0;
		if ($this->utf8_is_letter($str, $len, $pos, true))	return array($pos,$len);	// word found

			// Skip the run of non-letters ($len is 0 if we already were at the end of the string):
		$pos += $len;
		if ($pos < 0 || $pos >= strlen($str))	return false;	// end of string

			// A non-letter run can only be ended by a letter, so a word starts here:
		$this->utf8_is_letter($str, $len, $pos, true);
		return array($pos,$len);
	}
}
300
301
302 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']) {
303 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
304 }
305 ?>