[TASK] Re-work/simplify copyright header in PHP files - Part 8
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Lexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /**
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16 /**
17 * Lexer for indexed_search
18 *
19 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
20 */
21 /**
22 * Lexer class for indexed_search
23 * A lexer splits the text into words
24 *
25 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
26 */
class Lexer {

	/**
	 * Debugging switch. If set, split2Words() additionally fills $debugString.
	 *
	 * @var boolean
	 */
	public $debug = FALSE;

	/**
	 * If $debug is set, this is filled with HTML output highlighting
	 * search / non-search words (for backend display).
	 *
	 * @var string
	 */
	public $debugString = '';

	/**
	 * Charset class object
	 *
	 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
	 */
	public $csObj;

	/**
	 * Configuration of the lexer:
	 * - printjoins: Unicode numbers of characters that do not end a word when they
	 *   appear between letters (46 ".", 45 "-", 95 "_", 58 ":", 47 "/", 39 "'").
	 * - casesensitive: Set, if case sensitive indexing is wanted.
	 * - removeChars: Unicode numbers of characters stripped from (non-CJK) words
	 *   before they are added to the word list (45 "-").
	 *
	 * @var array
	 */
	public $lexerConf = array(
		'printjoins' => array(46, 45, 95, 58, 47, 39),
		'casesensitive' => FALSE,
		'removeChars' => array(45)
	);

	/**
	 * Constructor: Initializes the charset class
	 */
	public function __construct() {
		$this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * @param string $wordString String with UTF-8 content to process.
	 * @return array Array of words in utf-8
	 */
	public function split2Words($wordString) {
		// Reset debug string:
		$this->debugString = '';
		// Convert the string to lowercase unless case sensitive indexing is configured:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}
		// Now, splitting words:
		$pos = 0;
		$words = array();
		while (TRUE) {
			$match = $this->get_word($wordString, $pos);
			// get_word() returns FALSE when the end of the string has been reached:
			if (!is_array($match)) {
				break;
			}
			list($start, $len) = $match;
			if (!$len) {
				break;
			}
			$this->addWords($words, $wordString, $start, $len);
			if ($this->debug) {
				// Skipped (non-word) bytes are wrapped in a red span, the word itself is appended as-is:
				$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
			}
			// Continue right after the word just found:
			$pos = $start + $len;
		}
		return $words;
	}

	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/
	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * @param array $words Array of accumulated words (reference, words are appended)
	 * @param string $wordString Complete Input string from where to extract word
	 * @param integer $start Start position (byte offset) of word in input string
	 * @param integer $len The length (in bytes) of the word string from start position
	 * @return void
	 */
	public function addWords(&$words, &$wordString, $start, $len) {
		// Get word out of string:
		$theWord = substr($wordString, $start, $len);
		// Get the unicode number of the first char and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		list($cType) = $this->charType($cp);
		/*
		 * DESCRIPTION OF (CJK) ALGORITHM
		 *
		 * Continuous letters and numbers make up words. Spaces and symbols
		 * separate letters and numbers into words. This is sufficient for
		 * all western text.
		 *
		 * CJK doesn't use spaces or separators to separate words, so the only
		 * way to really find out what constitutes a word would be to have a
		 * dictionary and advanced heuristics. Instead, we form pairs from
		 * consecutive characters, in such a way that searches will find only
		 * characters that appear more-or-less the right sequence. For example:
		 *
		 *   ABCDE => AB BC CD DE
		 *
		 * This works okay since both the index and the search query is split
		 * in the same manner, and since the set of characters is huge so the
		 * extra matches are not significant.
		 *
		 * (Hint taken from ZOPEs chinese user group)
		 *
		 * [Kasper: As far as I can see this will only work well with or-searches!]
		 */
		if ($cType == 'cjk') {
			// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);
			// Traverse string length and add words as pairs of two chars.
			// A single-character sequence is added as it is.
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else {
			// Normal "single-byte" chars:
			// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
			// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param string $str Input string (reference)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @return array|boolean Array with 0: start, 1: len - or FALSE if no word has been found
	 */
	public function get_word(&$str, $pos = 0) {
		$len = 0;
		// If return is TRUE, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}
		// If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
		$pos += $len;
		// Check end of string before looking for word of course.
		// isset() is used so that no out-of-range string offset is read.
		if (!isset($str[$pos])) {
			return FALSE;
		}
		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Starting at $pos the string is scanned for as long as the characters are of the
	 * same kind (word chars or non-word chars) as the first one. Print-join chars inside
	 * a word are kept, print-join chars trailing a word are not counted into $len.
	 *
	 * @param string $str Input string (reference)
	 * @param integer $len Byte-length of character sequence (reference, return value)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @return boolean letter (or word) found
	 */
	public function utf8_is_letter(&$str, &$len, $pos = 0) {
		$len = 0;
		// Byte-length of the character read last:
		$bc = 0;
		// Unicode number of the character read last:
		$cp = 0;
		// Length of the word up to the first char of a run of print-join chars (0 = no such run pending):
		$printJoinLgd = 0;
		$cType = ($cType_prev = FALSE);
		// Whether the sequence is a sequence of letters; decided by the first char:
		$letter = TRUE;
		if (!isset($str[$pos])) {
			// Return FALSE on end-of-string at this stage
			return FALSE;
		}
		while (TRUE) {
			// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
			if ($len) {
				if ($letter) {
					// We are in a sequence of words.
					// A non-letter char or a change between CJK and num/alpha chars may end the word:
					if (!$cType || $cType_prev == 'cjk' && \TYPO3\CMS\Core\Utility\GeneralUtility::inList('num,alpha', $cType) || $cType == 'cjk' && \TYPO3\CMS\Core\Utility\GeneralUtility::inList('num,alpha', $cType_prev)) {
						// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
							// If a printjoin start length has been record, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						}
						// A printJoin char is found, record the length if it has not been recorded already:
						if (!$printJoinLgd) {
							$printJoinLgd = $len;
						}
					} else {
						// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {
					// End of non-word reached
					return FALSE;
				}
			}
			// Add byte-length of last found character
			$len += $bc;
			if (!isset($str[$pos])) {
				// End of string; return status of string till now.
				// Print-join chars trailing the word are cut off here as well, so that a word
				// at the very end of the string yields the same result as one followed by other chars.
				if ($letter && $printJoinLgd) {
					$len = $printJoinLgd;
				}
				return $letter;
			}
			// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;
			// Determine the type:
			$cType_prev = $cType;
			list($cType) = $this->charType($cp);
			if ($cType) {
				continue;
			}
			// Setting letter to FALSE if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Determine the type of character
	 *
	 * @param integer $cp Unicode number to evaluate
	 * @return array|NULL Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL if the char is none of these.
	 */
	public function charType($cp) {
		// Numeric? (ASCII digits 0-9)
		if ($cp >= 48 && $cp <= 57) {
			return array('num');
		}
		// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
			// A-Z
			$cp >= 65 && $cp <= 90
			// a-z
			|| $cp >= 97 && $cp <= 122
			// 192-255, except the multiplication (215) and division (247) signs
			|| $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247
			|| $cp >= 256 && $cp < 640
			|| ($cp == 902 || $cp >= 904 && $cp < 1024)
			|| ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328)
			|| ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523)
			|| ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747)
			|| $cp >= 7680 && $cp < 8192
		) {
			return array('alpha');
		}
		// Looking for CJK (Chinese / Japanese / Korean)
		// Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
		// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
			$cp >= 12352 && $cp <= 12543
			|| $cp >= 12592 && $cp <= 12687
			|| $cp >= 13312 && $cp <= 19903
			|| $cp >= 19968 && $cp <= 40879
			|| $cp >= 44032 && $cp <= 55215
			|| $cp >= 131072 && $cp <= 195103
		) {
			return array('cjk');
		}
		// Neither number, letter nor CJK. Callers use list($cType) = ..., which yields NULL for this.
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * @param string $str UTF-8 multibyte character string (reference)
	 * @param integer $len The length of the character in bytes (reference, return value)
	 * @param integer $pos Starting position (byte offset) in input string
	 * @param boolean $hex If set, then a hex. number is returned (string prefixed with "x")
	 * @return integer|string UNICODE codepoint
	 */
	public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
		// A position outside the string is read as byte value 0:
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;
		if ($ord > 128) {
			// Calculate number of extra bytes (number of leading 1-bits of the first byte, minus one):
			for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
				$bc++;
			}
			if ($bc > 5) {
				// The bytes 0xFE and 0xFF never occur in UTF-8. They are taken as one invalid byte;
				// calculating a bit mask for them would need a shift by a negative number.
				$ord = 0;
			} else {
				$len += $bc;
				// Mask utf-8 lead-in bytes
				$ord = $ord & (1 << 6 - $bc) - 1;
				// "Bring in" data bytes; bytes missing at the end of the string count as 0
				for ($i = $pos + 1; $bc; $bc--, $i++) {
					$dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
					$ord = ($ord << 6) | ($dataByte & 63);
				}
			}
		}
		return $hex ? 'x' . dechex($ord) : $ord;
	}

}