[TASK] Fix dummy parameter and return texts in phpDoc
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Lexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 /**
18 * Lexer for indexed_search
19 *
20 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
21 */
22 /**
23 * Lexer class for indexed_search
24 * A lexer splits the text into words
25 *
26 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
27 */
class Lexer {

    /**
     * Debugging switch: if set, split2Words() fills $debugString with HTML output
     * highlighting search / non-search words (for backend display).
     *
     * @var bool
     */
    public $debug = FALSE;

    /**
     * HTML debug output collected by split2Words() while $debug is set.
     * Skipped (non-word) characters are wrapped in a red <span>, words are appended as is.
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Charset class object
     *
     * @var \TYPO3\CMS\Core\Charset\CharsetConverter
     */
    public $csObj;

    /**
     * Configuration of the lexer:
     * - printjoins: unicode numbers of non-letter characters which do not end a word
     *   when they are found inside of it (46 ".", 45 "-", 95 "_", 58 ":", 47 "/", 39 "'")
     * - casesensitive: set, if case sensitive indexing is wanted
     * - removeChars: unicode numbers of characters which are stripped from
     *   non-CJK words before they are added to the word list (45 "-")
     *
     * @var array
     */
    public $lexerConf = array(
        'printjoins' => array(46, 45, 95, 58, 47, 39),
        'casesensitive' => FALSE,
        'removeChars' => array(45)
    );

    /**
     * Constructor: Initializes the charset class
     */
    public function __construct() {
        $this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
    }

    /**
     * Splitting string into words.
     * Used for indexing, can also be used to find words in query.
     *
     * @param string $wordString String with UTF-8 content to process.
     * @return array Array of words in utf-8
     */
    public function split2Words($wordString) {
        // Reset debug string:
        $this->debugString = '';
        // Convert the string to lowercase unless case sensitive indexing is wanted:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
        }
        // Now, splitting words:
        $pos = 0;
        $words = array();
        while (TRUE) {
            $found = $this->get_word($wordString, $pos);
            // get_word() returns FALSE at the end of the string; a zero length means "no word" as well
            if (!is_array($found) || empty($found[1])) {
                break;
            }
            list($start, $len) = $found;
            $this->addWords($words, $wordString, $start, $len);
            if ($this->debug) {
                $this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
            }
            // Continue right behind the word just found
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/
    /**
     * Add word to word-array
     * This function should be used to make sure CJK sequences are split up in the right way
     *
     * @param array $words Array of accumulated words (reference, words are appended)
     * @param string $wordString Complete Input string from where to extract word
     * @param int $start Start position of word in input string
     * @param int $len The Length of the word string from start position
     * @return void
     */
    public function addWords(&$words, &$wordString, $start, $len) {
        // Get word out of string:
        $theWord = substr($wordString, $start, $len);
        // Get the unicode number of the first char and find its type
        // (charType() returns NULL for unknown chars, list() then yields NULL):
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        list($cType) = $this->charType($cp);
        /*
         * DESCRIPTION OF (CJK) ALGORITHM
         *
         * Continuous letters and numbers make up words. Spaces and symbols
         * separate letters and numbers into words. This is sufficient for
         * all western text.
         *
         * CJK doesn't use spaces or separators to separate words, so the only
         * way to really find out what constitutes a word would be to have a
         * dictionary and advanced heuristics. Instead, we form pairs from
         * consecutive characters, in such a way that searches will find only
         * characters that appear more-or-less the right sequence. For example:
         *
         *     ABCDE => AB BC CD DE
         *
         * This works okay since both the index and the search query is split
         * in the same manner, and since the set of characters is huge so the
         * extra matches are not significant.
         *
         * (Hint taken from ZOPEs chinese user group)
         *
         * [Kasper: As far as I can see this will only work well with or-searches!]
         */
        if ($cType == 'cjk') {
            // Find total string length:
            $strlen = $this->csObj->utf8_strlen($theWord);
            // Traverse string length and add words as pairs of two chars
            // (a single char is added on its own, the last char never starts a pair):
            for ($a = 0; $a < $strlen; $a++) {
                if ($strlen == 1 || $a < $strlen - 1) {
                    $words[] = $this->csObj->utf8_substr($theWord, $a, 2);
                }
            }
        } else {
            // Normal "single-byte" chars:
            // Remove chars:
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
            }
            // Add word:
            $words[] = $theWord;
        }
    }

    /**
     * Get the first word in a given utf-8 string (initial non-letters will be skipped)
     *
     * @param string $str Input string (reference)
     * @param int $pos Starting position in input string
     * @return array|bool 0: start, 1: len or FALSE if the end of the string has been reached
     */
    public function get_word(&$str, $pos = 0) {
        $len = 0;
        // If return is TRUE, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return array($pos, $len);
        }
        // If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
        $pos += $len;
        // Check end of string before looking for word. isset() is used because reading an offset
        // beyond the end of a string raises an "Uninitialized string offset" notice/warning.
        if (!isset($str[$pos])) {
            return FALSE;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return array($pos, $len);
    }

    /**
     * See if a character is a letter (or a string of letters or non-letters).
     *
     * @param string $str Input string (reference)
     * @param int $len Byte-length of character sequence (reference, return value)
     * @param int $pos Starting position in input string
     * @return bool TRUE if a letter (or word) was found at $pos, FALSE for a sequence of non-letters or at the end of the string
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0) {
        $len = 0;
        // Byte-length and unicode number of the last character read
        $bc = 0;
        $cp = 0;
        // Letter type of the current and the previous character
        $cType = ($cType_prev = FALSE);
        // Length of the word before the first char of a run of print-join chars (0 = no run pending)
        $printJoinLgd = 0;
        // Looking for a letter? Turns FALSE if the first char is not a letter.
        $letter = TRUE;
        if (!isset($str[$pos])) {
            // Return FALSE on end-of-string at this stage
            return FALSE;
        }
        while (TRUE) {
            // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
            if ($len) {
                if ($letter) {
                    // We are in a sequence of words. A non-letter or a switch between CJK and num/alpha may end it:
                    if (!$cType || $cType_prev == 'cjk' && \TYPO3\CMS\Core\Utility\GeneralUtility::inList('num,alpha', $cType) || $cType == 'cjk' && \TYPO3\CMS\Core\Utility\GeneralUtility::inList('num,alpha', $cType_prev)) {
                        // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            // If a printjoin start length has been record, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return TRUE;
                        }
                        // If a printJoin char is found, record the length if it has not been recorded already:
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) {
                    // End of non-word reached
                    return FALSE;
                }
            }
            // Add byte-length of last found character
            $len += $bc;
            if (!isset($str[$pos])) {
                // End of string; return status of string till now
                return $letter;
            }
            // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            // Determine the type (NULL if the char is neither num, alpha nor cjk):
            $cType_prev = $cType;
            list($cType) = $this->charType($cp);
            // Setting letter to FALSE if the first char was not a letter!
            if (!$cType && !$len) {
                $letter = FALSE;
            }
        }
    }

    /**
     * Determine the type of character
     *
     * @param int $cp Unicode number to evaluate
     * @return array|NULL Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL if the char is none of these.
     */
    public function charType($cp) {
        // Numeric?
        if ($cp >= 48 && $cp <= 57) {
            return array('num');
        }
        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        $isAlpha =
            // A-Z and a-z
            $cp >= 65 && $cp <= 90
            || $cp >= 97 && $cp <= 122
            // 192-255 without the multiplication (215) and division (247) signs
            || $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247
            || $cp >= 256 && $cp < 640
            || $cp == 902
            || $cp >= 904 && $cp < 1024
            || $cp >= 1024 && $cp < 1154
            || $cp >= 1162 && $cp < 1328
            || $cp >= 1424 && $cp < 1456
            || $cp >= 1488 && $cp < 1523
            || $cp >= 1569 && $cp <= 1624
            || $cp >= 1646 && $cp <= 1747
            || $cp >= 7680 && $cp < 8192;
        if ($isAlpha) {
            return array('alpha');
        }
        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        $isCjk =
            $cp >= 12352 && $cp <= 12543
            || $cp >= 12592 && $cp <= 12687
            || $cp >= 13312 && $cp <= 19903
            || $cp >= 19968 && $cp <= 40879
            || $cp >= 44032 && $cp <= 55215
            || $cp >= 131072 && $cp <= 195103;
        if ($isCjk) {
            return array('cjk');
        }
        // Any other character (whitespace, punctuation, symbols, ...) has no type
        return NULL;
    }

    /**
     * Converts a UTF-8 multibyte character to a UNICODE codepoint
     *
     * Malformed input does not raise errors: bytes missing at the end of the string
     * count as 0 and a lead byte without any data bits yields 0 for that byte.
     *
     * @param string $str UTF-8 multibyte character string (reference)
     * @param int $len The length of the character (reference, return value)
     * @param int $pos Starting position in input string
     * @param bool $hex If set, then a hex. number is returned
     * @return int|string UNICODE codepoint, or "x" followed by the hex. number if $hex is set
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
        $len = 1;
        if ($ord > 128) {
            // Calculate number of extra bytes: count the leading 1-bits of the first byte, minus one
            for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
                $bc++;
            }
            $len += $bc;
            // Mask utf-8 lead-in bytes. The byte 0xFF leaves a negative number of data bits,
            // shifting by a negative number throws an ArithmeticError as of PHP 7, so no bits are kept then.
            $dataBits = 6 - $bc;
            $ord = $dataBits >= 0 ? ($ord & ((1 << $dataBits) - 1)) : 0;
            // "Bring in" data bytes (6 bits each); bytes beyond the end of the string count as 0
            for ($i = $pos + 1; $bc; $bc--, $i++) {
                $byte = isset($str[$i]) ? ord($str[$i]) : 0;
                $ord = ($ord << 6) | ($byte & 63);
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }

}