[TASK] Merge submodule version into core
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Lexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2001-2013 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the TYPO3 project. The TYPO3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 * A copy is found in the textfile GPL.txt and important notices to the license
19 * from the author is found in LICENSE.txt distributed with these scripts.
20 *
21 *
22 * This script is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * This copyright notice MUST APPEAR in all copies of the script!
28 ***************************************************************/
29 /**
30 * Lexer for indexed_search
31 *
32 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
33 */
34 /**
35 * Lexer class for indexed_search
36 * A lexer splits the text into words
37 *
38 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
39 */
class Lexer {

	/**
	 * Debugging switch. If set, split2Words() fills $debugString.
	 *
	 * @var boolean
	 */
	public $debug = FALSE;

	/**
	 * If $debug is set, the debugString is filled with HTML output highlighting
	 * search / non-search words (for backend display).
	 *
	 * @var string
	 */
	public $debugString = '';

	/**
	 * Charset class object
	 *
	 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
	 */
	public $csObj;

	/**
	 * Configuration of the lexer:
	 * - printjoins: Unicode numbers of characters which do not end a word when they
	 *   appear between letters (defaults: . - _ : / ')
	 * - casesensitive: Set, if case sensitive indexing is wanted.
	 * - removeChars: Unicode numbers of characters which are stripped from non-CJK
	 *   words before they are stored (default: -)
	 *
	 * @var array
	 */
	public $lexerConf = array(
		'printjoins' => array(46, 45, 95, 58, 47, 39),
		'casesensitive' => FALSE,
		'removeChars' => array(45)
	);

	/**
	 * Constructor: Initializes the charset class
	 *
	 * @return void
	 */
	public function __construct() {
		$this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * @param string $wordString String with UTF-8 content to process.
	 * @return array Array of words in utf-8
	 */
	public function split2Words($wordString) {
		// Reset debug string:
		$this->debugString = '';
		// Convert the string to lowercase unless case sensitive indexing is wanted:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}
		// Now, splitting words:
		$pos = 0;
		$words = array();
		while (TRUE) {
			$wordInfo = $this->get_word($wordString, $pos);
			// get_word() returns FALSE (not an array) when the end of the string is reached
			if (!is_array($wordInfo)) {
				break;
			}
			list($start, $len) = $wordInfo;
			if (!$len) {
				break;
			}
			$this->addWords($words, $wordString, $start, $len);
			if ($this->debug) {
				// Skipped non-word bytes are shown in red, the word itself is shown plain
				$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
			}
			// Continue right behind the word just found
			$pos = $start + $len;
		}
		return $words;
	}

	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/
	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * @param array $words Array of accumulated words (reference, words are appended)
	 * @param string $wordString Complete input string from where to extract word (reference)
	 * @param integer $start Start position (byte offset) of word in input string
	 * @param integer $len The length (in bytes) of the word string from start position
	 * @return void
	 */
	public function addWords(&$words, &$wordString, $start, $len) {
		// Get word out of string:
		$theWord = substr($wordString, $start, $len);
		// Get the unicode number of the first char and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		$typeInfo = $this->charType($cp);
		$cType = is_array($typeInfo) && isset($typeInfo[0]) ? $typeInfo[0] : NULL;
		// If string is a CJK sequence we follow this algorithm:
		/*
		DESCRIPTION OF (CJK) ALGORITHM

		Continuous letters and numbers make up words. Spaces and symbols
		separate letters and numbers into words. This is sufficient for
		all western text.

		CJK doesn't use spaces or separators to separate words, so the only
		way to really find out what constitutes a word would be to have a
		dictionary and advanced heuristics. Instead, we form pairs from
		consecutive characters, in such a way that searches will find only
		characters that appear more-or-less the right sequence. For example:

		ABCDE => AB BC CD DE

		This works okay since both the index and the search query is split
		in the same manner, and since the set of characters is huge so the
		extra matches are not significant.

		(Hint taken from ZOPEs chinese user group)

		[Kasper: As far as I can see this will only work well with or-searches!]
		*/
		if ($cType == 'cjk') {
			// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);
			// Traverse string length and add words as pairs of two chars.
			// A single char is added on its own; otherwise the last char only
			// appears as second half of the final pair.
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else {
			// Normal "single-byte" chars:
			// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
			// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param string $str Input string (reference)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @return array|boolean Array with 0: start, 1: len - or FALSE if no word has been found
	 */
	public function get_word(&$str, $pos = 0) {
		$len = 0;
		// If return is TRUE, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}
		// If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
		$pos += $len;
		// Check end of string before looking for word of course.
		// isset() is used since reading a string offset beyond the end raises an error message.
		if (!isset($str[$pos])) {
			return FALSE;
		}
		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Starting at $pos the method consumes either a run of word chars or a run of
	 * non-word chars, whatever the first char belongs to, and reports the byte
	 * length of that run in $len.
	 *
	 * Print-join chars followed by a terminating non-letter are cut off the word.
	 * Print-join chars at the very end of the input string stay part of the word.
	 *
	 * @param string $str Input string (reference)
	 * @param integer $len Byte-length of character sequence (reference, return value)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @return boolean TRUE if the sequence is a letter (or word), FALSE for non-letters or end of string
	 */
	public function utf8_is_letter(&$str, &$len, $pos = 0) {
		$len = 0;
		// Byte length of the char read last; added to $len one loop iteration later
		$bc = 0;
		// Unicode number of the char read last
		$cp = 0;
		// Word length recorded at the first print-join char of a run (0 = none recorded)
		$printJoinLgd = 0;
		$cType = ($cType_prev = FALSE);
		// Letter type
		$letter = TRUE;
		// looking for a letter?
		if (!isset($str[$pos])) {
			// Return FALSE on end-of-string at this stage
			return FALSE;
		}
		while (TRUE) {
			// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
			if ($len) {
				if ($letter) {
					// We are in a sequence of words.
					// The word ends at a non-letter and at every change between CJK and num/alpha chars.
					if (
						!$cType
						|| $cType_prev == 'cjk' && ($cType === 'num' || $cType === 'alpha')
						|| $cType == 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha')
					) {
						// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
							// If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						}
						// If a printJoin char is found, record the length if it has not been recorded already:
						if (!$printJoinLgd) {
							$printJoinLgd = $len;
						}
					} else {
						// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {
					// end of non-word reached
					return FALSE;
				}
			}
			// add byte-length of last found character
			$len += $bc;
			if (!isset($str[$pos])) {
				// End of string; return status of string till now
				return $letter;
			}
			// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;
			// Determine the type:
			$cType_prev = $cType;
			$typeInfo = $this->charType($cp);
			$cType = is_array($typeInfo) && isset($typeInfo[0]) ? $typeInfo[0] : NULL;
			if ($cType) {
				continue;
			}
			// Setting letter to FALSE if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
		return FALSE;
	}

	/**
	 * Determine the type of character
	 *
	 * @param integer $cp Unicode number to evaluate
	 * @return array|NULL Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL if the char is none of these.
	 */
	public function charType($cp) {
		// Numeric? (ASCII digits 0-9)
		if ($cp >= 48 && $cp <= 57) {
			return array('num');
		}
		// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
			// Latin: A-Z, a-z
			$cp >= 65 && $cp <= 90
			|| $cp >= 97 && $cp <= 122
			// Latin-1 letters, except multiplication sign (215) and division sign (247)
			|| $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247
			// Extended Latin
			|| $cp >= 256 && $cp < 640
			// Greek
			|| ($cp == 902 || $cp >= 904 && $cp < 1024)
			// Cyrillic
			|| ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328)
			// Hebrew
			|| ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523)
			// Arabic
			|| ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747)
			// Additional extended Latin and Greek
			|| $cp >= 7680 && $cp < 8192
		) {
			return array('alpha');
		}
		// Looking for CJK (Chinese / Japanese / Korean)
		// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
		// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
			$cp >= 12352 && $cp <= 12543
			|| $cp >= 12592 && $cp <= 12687
			|| $cp >= 13312 && $cp <= 19903
			|| $cp >= 19968 && $cp <= 40879
			|| $cp >= 44032 && $cp <= 55215
			|| $cp >= 131072 && $cp <= 195103
		) {
			return array('cjk');
		}
		// Anything else (whitespace, punctuation, symbols, ...) has no type
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * Bytes missing from the string (empty string, truncated sequence) are read as 0.
	 *
	 * @param string $str UTF-8 multibyte character string (reference)
	 * @param integer $len The length of the character in bytes (reference, return value)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @param boolean $hex If set, then a hex. number is returned (string prefixed with "x")
	 * @return integer|string UNICODE codepoint
	 */
	public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;
		if ($ord > 128) {
			// calculate number of extra bytes (count of leading 1-bits of the lead byte, minus one)
			for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
				$bc++;
			}
			$len += $bc;
			// mask utf-8 lead-in bytes
			// The invalid lead byte 0xFF yields $bc = 7; it carries no data bits and
			// must not reach the shift since shifting by a negative number throws an error.
			$ord = $bc < 7 ? ($ord & ((1 << (6 - $bc)) - 1)) : 0;
			// "bring in" data bytes
			for ($i = $pos + 1; $bc; $bc--, $i++) {
				$ord = ($ord << 6) | (isset($str[$i]) ? (ord($str[$i]) & 63) : 0);
			}
		}
		return $hex ? 'x' . dechex($ord) : $ord;
	}

}
324
325
326 ?>