[TASK] Remove unused CMD property from SchedulerModuleController
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Lexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Charset\CharsetConverter;
18 use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
19 use TYPO3\CMS\Core\Utility\GeneralUtility;
20
/**
 * Lexer class for indexed_search
 *
 * A lexer splits a UTF-8 text into words. All positions and lengths handled
 * by the helper methods of this class are BYTE offsets into the string, not
 * character offsets.
 *
 * @internal
 */
class Lexer
{
    use PublicPropertyDeprecationTrait;

    /**
     * List of all deprecated public properties
     *
     * @var array
     */
    protected $deprecatedPublicProperties = [
        'csObj' => 'Using $csObj within Indexing is discouraged, the property will be removed in TYPO3 v10.0 - if needed instantiate CharsetConverter yourself.',
    ];

    /**
     * Debugging option: if set, split2Words() fills $debugString.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * If $debug is set, the debugString is filled with HTML output highlighting
     * search / non-search words (for backend display)
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Charset class object
     *
     * NOTE(review): this property is still declared public although it is listed in
     * $deprecatedPublicProperties; presumably the deprecation trait can only intercept
     * access to non-public properties - confirm whether it should be protected.
     *
     * @var CharsetConverter
     * @deprecated since TYPO3 v9.3, will be removed in TYPO3 v10.0 (also the instantiation in the init() method).
     */
    public $csObj;

    /**
     * Configuration of the lexer:
     *
     * - printjoins:    Unicode numbers of characters that may join letters inside a word.
     * - casesensitive: Set, if case sensitive indexing is wanted.
     * - removeChars:   Unicode numbers of characters that are stripped from non-CJK words.
     *
     * @var array
     */
    public $lexerConf = [
        // Characters: . - _ : / '
        'printjoins' => [46, 45, 95, 58, 47, 39],
        // Set, if case sensitive indexing is wanted.
        'casesensitive' => false,
        // Character: -
        'removeChars' => [45]
    ];

    /**
     * Constructor: Initializes the charset class
     */
    public function __construct()
    {
        // @deprecated, can be removed in TYPO3 v10.0.
        $this->csObj = GeneralUtility::makeInstance(CharsetConverter::class);
    }

    /**
     * Splitting string into words.
     * Used for indexing, can also be used to find words in query.
     *
     * Unless $lexerConf['casesensitive'] is set, the input is lowercased first.
     * If $debug is set, $debugString is rebuilt with an HTML representation of
     * the skipped (red) and accepted parts of the input.
     *
     * @param string $wordString String with UTF-8 content to process.
     * @return array Array of words in utf-8
     */
    public function split2Words($wordString)
    {
        // Reset debug string:
        $this->debugString = '';
        // Then convert the string to lowercase:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = mb_strtolower($wordString, 'utf-8');
        }
        // Now, splitting words:
        $pos = 0;
        $words = [];
        while (true) {
            // get_word() returns FALSE at the end of the string; list() then leaves
            // $start and $len as NULL, which terminates the loop below.
            list($start, $len) = $this->get_word($wordString, $pos);
            if (!$len) {
                break;
            }
            $this->addWords($words, $wordString, $start, $len);
            if ($this->debug) {
                // Positions are byte offsets, so the byte-wise substr() is the right tool here.
                $skipped = substr($wordString, $pos, $start - $pos);
                $accepted = substr($wordString, $start, $len);
                $this->debugString .= '<span style="color:red">' . htmlspecialchars($skipped) . '</span>' . htmlspecialchars($accepted);
            }
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/
    /**
     * Add word to word-array
     * This function should be used to make sure CJK sequences are split up in the right way
     *
     * The character type of the FIRST character of the word decides whether the
     * word is treated as a CJK sequence or as a normal word.
     *
     * @param array $words Array of accumulated words (reference, words are appended)
     * @param string $wordString Complete Input string from where to extract word
     * @param int $start Start position (byte offset) of word in input string
     * @param int $len The Length (in bytes) of the word string from start position
     */
    public function addWords(&$words, &$wordString, $start, $len)
    {
        // Get word out of string:
        $theWord = substr($wordString, $start, $len);
        // Get first chars unicode number and find type:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        list($cType) = $this->charType($cp);
        /*
         * DESCRIPTION OF (CJK) ALGORITHM
         *
         * Continuous letters and numbers make up words. Spaces and symbols
         * separate letters and numbers into words. This is sufficient for
         * all western text.
         *
         * CJK doesn't use spaces or separators to separate words, so the only
         * way to really find out what constitutes a word would be to have a
         * dictionary and advanced heuristics. Instead, we form pairs from
         * consecutive characters, in such a way that searches will find only
         * characters that appear more-or-less the right sequence. For example:
         *
         *     ABCDE => AB BC CD DE
         *
         * This works okay since both the index and the search query is split
         * in the same manner, and since the set of characters is huge so the
         * extra matches are not significant.
         *
         * (Hint taken from ZOPEs chinese user group)
         *
         * [Kasper: As far as I can see this will only work well with or-searches!]
         */
        if ($cType === 'cjk') {
            // Find total string length (in characters):
            $strlen = mb_strlen($theWord, 'utf-8');
            // A single character is added as it is, longer sequences as overlapping pairs:
            $pairCount = $strlen > 1 ? $strlen - 1 : $strlen;
            for ($a = 0; $a < $pairCount; $a++) {
                $words[] = mb_substr($theWord, $a, 2, 'utf-8');
            }
            return;
        }
        // Normal "single-byte" chars:
        // Remove chars:
        $charsetConverter = GeneralUtility::makeInstance(CharsetConverter::class);
        foreach ($this->lexerConf['removeChars'] as $skipJoin) {
            $theWord = str_replace($charsetConverter->UnumberToChar($skipJoin), '', $theWord);
        }
        // Add word:
        $words[] = $theWord;
    }

    /**
     * Get the first word in a given utf-8 string (initial non-letters will be skipped)
     *
     * @param string $str Input string (reference)
     * @param int $pos Starting position (byte offset) in input string
     * @return array|bool 0: start, 1: len (both in bytes) or FALSE if no word has been found
     */
    public function get_word(&$str, $pos = 0)
    {
        $len = 0;
        // If return is TRUE, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return [$pos, $len];
        }
        // If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
        $pos += $len;
        // Check end of string before looking for word of course.
        // isset() is used instead of reading the offset, because reading past the
        // end of a string raises an "Uninitialized string offset" notice/warning.
        if (!isset($str[$pos])) {
            return false;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return [$pos, $len];
    }

    /**
     * See if a character is a letter (or a string of letters or non-letters).
     *
     * Starting at $pos the method consumes a run of characters of the same kind:
     * either a word (letters / numbers, possibly joined by print-join characters)
     * or a sequence of non-word characters. A change between CJK and
     * alphanumeric characters also ends a word.
     *
     * NOTE(review): if the input ends directly after one or more print-join
     * characters (e.g. "foo."), these trailing characters stay part of the
     * reported word length; they are only cut off when a non-print-join
     * character follows. Confirm whether this is intended before changing it,
     * since it affects the words written to existing indexes.
     *
     * @param string $str Input string (reference)
     * @param int $len Byte-length of character sequence (reference, return value)
     * @param int $pos Starting position (byte offset) in input string
     * @return bool letter (or word) found
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0)
    {
        $len = 0;
        // Byte-length of the character read most recently
        $bc = 0;
        // Unicode number of the character read most recently
        $cp = 0;
        // Length of the word before the first of a run of print-join characters (0 = none pending)
        $printJoinLgd = 0;
        // Letter type of the current and the previous character
        $cType = ($cType_prev = false);
        // Looking for a letter? Switched to FALSE if the first character is not a letter.
        $letter = true;
        if (!isset($str[$pos])) {
            // Return FALSE on end-of-string at this stage
            return false;
        }
        while (true) {
            // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
            if ($len) {
                if ($letter) {
                    // We are in a sequence of words
                    if (
                        !$cType
                        || $cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha')
                        || $cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha')
                    ) {
                        // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return true;
                        }
                        // If a printJoin char is found, record the length if it has not been recorded already:
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif (!$letter && $cType) {
                    // end of non-word reached
                    return false;
                }
            }
            // add byte-length of last found character
            $len += $bc;
            if (!isset($str[$pos])) {
                // End of string; return status of string till now
                return $letter;
            }
            // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            // Determine the type:
            $cType_prev = $cType;
            list($cType) = $this->charType($cp);
            if ($cType) {
                continue;
            }
            // Setting letter to FALSE if the first char was not a letter!
            if (!$len) {
                $letter = false;
            }
        }
        return false;
    }

    /**
     * Determine the type of character
     *
     * @param int $cp Unicode number to evaluate
     * @return array|null Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL if the character belongs to none of these types.
     */
    public function charType($cp)
    {
        // Numeric?
        if ($cp >= 48 && $cp <= 57) {
            return ['num'];
        }
        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
            // A-Z and a-z
            $cp >= 65 && $cp <= 90
            || $cp >= 97 && $cp <= 122
            // 192-255 without the multiplication (215) and division (247) signs
            || $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247
            || $cp >= 256 && $cp < 640
            || ($cp == 902 || $cp >= 904 && $cp < 1024)
            || ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328)
            || ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523)
            || ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747)
            || $cp >= 7680 && $cp < 8192
        ) {
            return ['alpha'];
        }
        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
            $cp >= 12352 && $cp <= 12543
            || $cp >= 12592 && $cp <= 12687
            || $cp >= 13312 && $cp <= 19903
            || $cp >= 19968 && $cp <= 40879
            || $cp >= 44032 && $cp <= 55215
            || $cp >= 131072 && $cp <= 195103
        ) {
            return ['cjk'];
        }
        // Neither number, letter nor CJK. Callers use list($cType) = ..., which yields NULL for this value.
        return null;
    }

    /**
     * Converts a UTF-8 multibyte character to a UNICODE codepoint
     *
     * The method is lenient towards malformed input: bytes that are missing
     * because the string ends inside a multi-byte sequence count as 0, and a
     * lead byte without any payload bits contributes 0 as well.
     *
     * @param string $str UTF-8 multibyte character string (reference)
     * @param int $len The length of the character in bytes (reference, return value)
     * @param int $pos Starting position (byte offset) in input string
     * @param bool $hex If set, then a hex. number is returned
     * @return int|string UNICODE codepoint; if $hex is set, a string made of "x" followed by the hexadecimal number
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
    {
        // Reading a string offset past the end raises a notice/warning, so check first.
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
        $len = 1;
        if ($ord > 128) {
            for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
                // calculate number of extra bytes
                $bc++;
            }
            $len += $bc;
            // mask utf-8 lead-in bytes
            // The invalid lead byte 0xFF yields 7 extra bytes, which would result in a
            // negative shift count (ArithmeticError); no payload bits are left in that case.
            $payloadBits = 6 - $bc;
            $ord = $payloadBits > 0 ? $ord & ((1 << $payloadBits) - 1) : 0;
            // "bring in" data bytes
            for ($i = $pos + 1; $bc; $bc--, $i++) {
                $ord = ($ord << 6) | (isset($str[$i]) ? ord($str[$i]) & 63 : 0);
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }
}