[!!!][TASK] Remove deprecated code from EXT:indexed_search
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Lexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Charset\CharsetConverter;
18 use TYPO3\CMS\Core\Utility\GeneralUtility;
19
20 /**
21 * Lexer class for indexed_search
22 * A lexer splits the text into words
23 * @internal
24 */
class Lexer
{
    /**
     * Debugging option: if TRUE, split2Words() fills $debugString while splitting.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * If $debug is set, the debugString is filled with HTML output highlighting
     * search / non-search words (for backend display). It is reset on every
     * call of split2Words().
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Configuration of the lexer:
     * - printjoins: unicode numbers of characters which may join letters inside a word
     * - casesensitive: set, if case sensitive indexing is wanted
     * - removeChars: unicode numbers of characters stripped from non-CJK words before they are added
     *
     * @var array
     */
    public $lexerConf = [
        // Characters: . - _ : / '
        'printjoins' => [46, 45, 95, 58, 47, 39],
        // Set, if case sensitive indexing is wanted.
        'casesensitive' => false,
        // Character: -
        'removeChars' => [45]
    ];

    /**
     * Splitting string into words.
     * Used for indexing, can also be used to find words in query.
     *
     * @param string $wordString String with UTF-8 content to process.
     * @return array Array of words in utf-8
     */
    public function split2Words($wordString)
    {
        // Reset debug string:
        $this->debugString = '';
        // Then convert the string to lowercase:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = mb_strtolower($wordString, 'utf-8');
        }
        // Now, splitting words:
        $pos = 0;
        $words = [];
        while (true) {
            $match = $this->get_word($wordString, $pos);
            // get_word() returns FALSE (not an array) when no further word exists,
            // and a zero length when nothing could be consumed: both end the loop.
            if (!is_array($match) || !$match[1]) {
                break;
            }
            $start = $match[0];
            $len = $match[1];
            $this->addWords($words, $wordString, $start, $len);
            if ($this->debug) {
                // Offsets are byte offsets, so the byte-wise substr() is the right tool here.
                // The skipped (non-word) part is wrapped in red, the word itself follows plain.
                $skipped = substr($wordString, $pos, $start - $pos);
                $word = substr($wordString, $start, $len);
                $this->debugString .= '<span style="color:red">' . htmlspecialchars($skipped) . '</span>' . htmlspecialchars($word);
            }
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/
    /**
     * Add word to word-array
     * This function should be used to make sure CJK sequences are split up in the right way
     *
     * @param array $words Array of accumulated words (reference, appended to)
     * @param string $wordString Complete Input string from where to extract word
     * @param int $start Start position (byte offset) of word in input string
     * @param int $len The Length (in bytes) of the word string from start position
     */
    public function addWords(&$words, &$wordString, $start, $len)
    {
        // Get word out of string:
        $theWord = substr($wordString, $start, $len);
        // Get the unicode number of the first char and find its type:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        $cType = $this->mainCharType($cp);
        /*
         * DESCRIPTION OF (CJK) ALGORITHM
         *
         * Continuous letters and numbers make up words. Spaces and symbols
         * separate letters and numbers into words. This is sufficient for
         * all western text.
         *
         * CJK doesn't use spaces or separators to separate words, so the only
         * way to really find out what constitutes a word would be to have a
         * dictionary and advanced heuristics. Instead, we form pairs from
         * consecutive characters, in such a way that searches will find only
         * characters that appear more-or-less the right sequence. For example:
         *
         * ABCDE => AB BC CD DE
         *
         * This works okay since both the index and the search query is split
         * in the same manner, and since the set of characters is huge so the
         * extra matches are not significant.
         *
         * (Hint taken from ZOPEs chinese user group)
         *
         * [Kasper: As far as I can see this will only work well with or-searches!]
         */
        if ($cType === 'cjk') {
            // Find total string length (in characters):
            $strlen = mb_strlen($theWord, 'utf-8');
            // Traverse string length and add words as pairs of two chars.
            // A single character is added on its own; the last character of a
            // longer sequence is already part of the preceding pair.
            for ($a = 0; $a < $strlen; $a++) {
                if ($strlen == 1 || $a < $strlen - 1) {
                    $words[] = mb_substr($theWord, $a, 2, 'utf-8');
                }
            }
        } else {
            // Normal "single-byte" chars:
            // Remove chars:
            $charsetConverter = GeneralUtility::makeInstance(CharsetConverter::class);
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($charsetConverter->UnumberToChar($skipJoin), '', $theWord);
            }
            // Add word:
            $words[] = $theWord;
        }
    }

    /**
     * Get the first word in a given utf-8 string (initial non-letters will be skipped)
     *
     * @param string $str Input string (reference)
     * @param int $pos Starting position (byte offset) in input string
     * @return array|bool 0: start, 1: len or FALSE if no word has been found
     */
    public function get_word(&$str, $pos = 0)
    {
        $len = 0;
        // If return is TRUE, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return [$pos, $len];
        }
        // If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
        $pos += $len;
        // Check end of string before looking for word of course.
        // isset() is used because reading an offset beyond the end of the string raises an "Uninitialized string offset" error.
        if (!isset($str[$pos])) {
            return false;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return [$pos, $len];
    }

    /**
     * See if a character is a letter (or a string of letters or non-letters).
     *
     * Starting at $pos the method consumes either a run of word characters or a
     * run of non-word characters, whichever the first character belongs to, and
     * reports the byte length of that run in $len.
     *
     * @param string $str Input string (reference)
     * @param int $len Byte-length of character sequence (reference, return value)
     * @param int $pos Starting position (byte offset) in input string
     * @return bool TRUE if the consumed run is a word, FALSE for a run of non-word chars or at end of string
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0)
    {
        $len = 0;
        // Byte length of the character read in the previous iteration
        $bc = 0;
        $cp = 0;
        // Length of the word before a trailing run of print-join chars started (0 = none recorded)
        $printJoinLgd = 0;
        $cType = ($cType_prev = false);
        // Whether the run started with a letter (looking for a word) or not
        $letter = true;
        // Return FALSE on end-of-string at this stage
        if (!isset($str[$pos])) {
            return false;
        }
        while (true) {
            // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
            if ($len) {
                if ($letter) {
                    // We are in a sequence of words. The word ends at a non-letter
                    // char or where CJK and num/alpha characters meet.
                    if (
                        !$cType
                        || ($cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha'))
                        || ($cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha'))
                    ) {
                        // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        // (Loose comparison kept on purpose: $lexerConf is public and may carry numeric strings.)
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return true;
                        }
                        // If a printJoin char is found, record the length if it has not been recorded already:
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) {
                    // end of non-word reached
                    return false;
                }
            }
            // add byte-length of last found character
            $len += $bc;
            if (!isset($str[$pos])) {
                // End of string; return status of string till now.
                // NOTE(review): a print-join run directly at the end of the string is
                // not trimmed here, so it stays part of the reported length.
                return $letter;
            }
            // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            // Determine the type:
            $cType_prev = $cType;
            $cType = $this->mainCharType($cp);
            if ($cType) {
                continue;
            }
            // Setting letter to FALSE if the first char was not a letter!
            if (!$len) {
                $letter = false;
            }
        }
    }

    /**
     * Determine the type of character
     *
     * @param int $cp Unicode number to evaluate
     * @return array|null Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL if the number is in none of the known ranges.
     */
    public function charType($cp)
    {
        // Numeric?
        if ($cp >= 48 && $cp <= 57) {
            return ['num'];
        }
        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
            $cp >= 65 && $cp <= 90
            || $cp >= 97 && $cp <= 122
            // 215 and 247 are excluded from this range
            || $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247
            || $cp >= 256 && $cp < 640
            || ($cp == 902 || $cp >= 904 && $cp < 1024)
            || ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328)
            || ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523)
            || ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747)
            || $cp >= 7680 && $cp < 8192
        ) {
            return ['alpha'];
        }
        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
            $cp >= 12352 && $cp <= 12543
            || $cp >= 12592 && $cp <= 12687
            || $cp >= 13312 && $cp <= 19903
            || $cp >= 19968 && $cp <= 40879
            || $cp >= 44032 && $cp <= 55215
            || $cp >= 131072 && $cp <= 195103
        ) {
            return ['cjk'];
        }
        // Not a word character
        return null;
    }

    /**
     * Converts a UTF-8 multibyte character to a UNICODE codepoint
     *
     * Bytes which would have to be read beyond the end of the string are
     * treated as 0. The input is not validated as UTF-8.
     *
     * @param string $str UTF-8 multibyte character string (reference)
     * @param int $len The length of the character in bytes (reference, return value)
     * @param int $pos Starting position (byte offset) in input string
     * @param bool $hex If set, then a hex. number is returned
     * @return int|string UNICODE codepoint; if $hex is set a string with the hex number prefixed with "x"
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
    {
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
        $len = 1;
        if ($ord > 128) {
            // calculate number of extra bytes
            for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
                $bc++;
            }
            $len += $bc;
            // mask utf-8 lead-in bytes.
            // The invalid lead byte 0xFF yields $bc = 7; shifting by a negative
            // number throws an ArithmeticError, so no payload bits are kept then.
            $payloadBits = 6 - $bc;
            $ord = $payloadBits > 0 ? $ord & ((1 << $payloadBits) - 1) : 0;
            // "bring in" data bytes
            for ($i = $pos + 1; $bc; $bc--, $i++) {
                $dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
                $ord = ($ord << 6) | ($dataByte & 63);
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }

    /**
     * Returns the main type reported by charType() for a unicode number.
     *
     * @param int $cp Unicode number to evaluate
     * @return string|null num, alpha or cjk; NULL if charType() reported no type
     */
    private function mainCharType($cp)
    {
        $type = $this->charType($cp);
        return is_array($type) ? $type[0] : null;
    }
}