5524fb1838c9b02670f6503daf39c31f68ef8c0d
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Html / HtmlParser.php
1 <?php
2 namespace TYPO3\CMS\Core\Html;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18 use TYPO3\CMS\Core\Utility\MathUtility;
19 use TYPO3\CMS\Frontend\ContentObject\ContentObjectRenderer;
20
21 /**
22 * Functions for parsing HTML.
23 * You are encouraged to use this class in your own applications
24 */
25 class HtmlParser
26 {
/**
 * Runtime cache used by caseShift(): upper-cased arrays, indexed by the cache key handed to that method.
 *
 * @var array
 */
protected $caseShift_cache = [];

/**
 * Pipe-separated list of void elements (elements that do not have closing tags), as defined by HTML5,
 * except the link element. HTMLcleaner() embeds this list into a regular expression alternation.
 */
const VOID_ELEMENTS = 'area|base|br|col|command|embed|hr|img|input|keygen|meta|param|source|track|wbr';
34
35 /************************************
36 *
37 * Parsing HTML code
38 *
39 ************************************/
/**
 * Splits $content into an array of alternating "outside" and "block" pieces for the given tags.
 *
 * Entries with an even index hold the content found outside the listed tags, entries with an odd
 * index hold a complete block (start tag, inner content including nested blocks, end tag).
 * Use ->removeFirstAndLastTag() on a block entry to get its inner content.
 *
 * @param string $tag List of tags, comma separated.
 * @param string $content HTML-content
 * @param bool $eliminateExtraEndTags If set, end tags without a matching start tag are dropped from the result.
 * @return array Even numbers in the array are outside the blocks, Odd numbers are block-content.
 * @see splitTags(), removeFirstAndLastTag()
 */
public function splitIntoBlock($tag, $content, $eliminateExtraEndTags = false)
{
    // Build the alternation of (regex-quoted) tag names
    $quotedTagNames = [];
    foreach (array_unique(GeneralUtility::trimExplode(',', $tag, true)) as $tagName) {
        $quotedTagNames[] = preg_quote($tagName, '/');
    }
    $pattern = '/\\<\\/?(' . implode('|', $quotedTagNames) . ')(\\s*\\>|\\s[^\\>]*\\>)/si';
    $segments = preg_split($pattern, $content);

    $blocks = [];
    $current = $segments[0];
    // $offset always points at the "<" of the tag that precedes the segment being processed
    $offset = strlen($segments[0]);
    $depth = 0;
    foreach (array_slice($segments, 1) as $segment) {
        $segmentLength = strlen($segment);
        $tagLength = strcspn(substr($content, $offset), '>') + 1;

        if (substr($content, $offset, 2) !== '</') {
            // Start tag: when entering a block from ground level, flush what was collected outside of it
            if ($depth === 0) {
                $blocks[] = $current;
                $current = '';
            }
            $depth++;
            // Take the tag together with the content that follows it
            $chunk = substr($content, $offset, $segmentLength + $tagLength);
            $offset += strlen($chunk);
            $current .= $chunk;
            continue;
        }

        // End tag: one level up
        $depth--;
        $dropped = $eliminateExtraEndTags && $depth < 0;
        if ($dropped) {
            // Surplus end tag: ignore it and stay on ground level
            $depth = 0;
        } else {
            $current .= substr($content, $offset, $tagLength);
        }
        $offset += $tagLength;
        // A block was closed properly (not by dropping a tag): store it
        if ($depth === 0 && !$dropped) {
            $blocks[] = $current;
            $current = '';
        }
        // Content following the end tag
        $chunk = substr($content, $offset, $segmentLength);
        $offset += strlen($chunk);
        $current .= $chunk;
    }
    $blocks[] = $current;
    return $blocks;
}
110
/**
 * Splits content into blocks *recursively* and hands tags/content to call back methods for processing.
 *
 * Content outside of blocks is passed to $callBackContent, every block is passed to $callBackTags as an
 * array with the keys "tag_start", "tag_end", "tag_name" and "content" (the already processed inner content).
 * A call back is skipped when its name is empty.
 *
 * @param string $tag Tag list, see splitIntoBlock()
 * @param string $content Content, see splitIntoBlock()
 * @param object $procObj Object where call back methods are.
 * @param string $callBackContent Name of call back method for content; "function callBackContent($str,$level)
 * @param string $callBackTags Name of call back method for tags; "function callBackTags($tags,$level)
 * @param int $level Indent level
 * @return string Processed content
 * @see splitIntoBlock()
 */
public function splitIntoBlockRecursiveProc($tag, $content, &$procObj, $callBackContent, $callBackTags, $level = 0)
{
    $segments = $this->splitIntoBlock($tag, $content, true);
    foreach ($segments as $index => $segment) {
        if ($index % 2 === 0) {
            // Content outside of any block
            if ($callBackContent) {
                $segments[$index] = $procObj->{$callBackContent}($segment, $level);
            }
            continue;
        }
        // A block: describe it, process its inner content first, then let the call back adjust it
        $tagName = $this->getFirstTagName($segment, true);
        $block = [
            'tag_start' => $this->getFirstTag($segment),
            'tag_end' => '</' . $tagName . '>',
            'tag_name' => strtolower($tagName),
            'content' => $this->splitIntoBlockRecursiveProc(
                $tag,
                $this->removeFirstAndLastTag($segment),
                $procObj,
                $callBackContent,
                $callBackTags,
                $level + 1
            ),
        ];
        if ($callBackTags) {
            $block = $procObj->{$callBackTags}($block, $level);
        }
        $segments[$index] = $block['tag_start'] . $block['content'] . $block['tag_end'];
    }
    return implode('', $segments);
}
146
/**
 * Splits $content at every occurrence of one of the listed tags (single tags, end tags are not matched).
 *
 * Entries with an even index hold the content between the tags, entries with an odd index hold
 * the matched tag itself (from "<" up to and including the next ">").
 *
 * @param string $tag List of tags, comma separated.
 * @param string $content HTML-content
 * @return array Even numbers in the array are content outside the tags, Odd numbers are the tags.
 * @see splitIntoBlock(), removeFirstAndLastTag()
 */
public function splitTags($tag, $content)
{
    $quotedTagNames = array_map(
        function ($tagName) {
            return preg_quote($tagName, '/');
        },
        GeneralUtility::trimExplode(',', $tag, true)
    );
    $pattern = '/\\<(' . implode('|', $quotedTagNames) . ')(\\s[^>]*)?\\/?>/si';
    $segments = preg_split($pattern, $content);

    $result = [$segments[0]];
    // $offset always points at the "<" of the tag that precedes the next segment
    $offset = strlen($segments[0]);
    foreach (array_slice($segments, 1) as $textAfterTag) {
        $tagLength = strcspn(substr($content, $offset), '>') + 1;
        $tagMarkup = substr($content, $offset, $tagLength);
        // First the tag, then the content following it
        $result[] = $tagMarkup;
        $result[] = $textAfterTag;
        $offset += strlen($tagMarkup) + strlen($textAfterTag);
    }
    return $result;
}
184
/**
 * Strips the first and the last tag from the string.
 * Returns what lies between the first ">" and the last "<"; anything before the first tag
 * and after the last tag is dropped as well.
 *
 * @param string $str String to process
 * @return string
 */
public function removeFirstAndLastTag($str)
{
    // Position right after the end of the first tag
    $contentStart = strpos($str, '>') + 1;
    // Position where the last tag begins
    $lastTagStart = strrpos($str, '<');
    return substr($str, $contentStart, $lastTagStart - $contentStart);
}
201
/**
 * Returns the first tag in $str
 * More precisely: everything from the start of $str up to and including the first ">" is returned,
 * so the tag should be the very first thing in the string. Without any ">" an empty string is returned.
 *
 * @param string $str HTML string with tags
 * @return string
 */
public function getFirstTag($str)
{
    $closingBracket = strpos($str, '>');
    if ($closingBracket === false) {
        return '';
    }
    return substr($str, 0, $closingBracket + 1);
}
215
/**
 * Returns the NAME of the first tag in $str
 *
 * The tag has to be the first thing in $str (leading whitespace is allowed) and the name must be
 * followed by whitespace or ">". If no such tag is found, an empty string is returned.
 *
 * @param string $str HTML tag
 * @param bool $preserveCase If set, the tag name is returned as written, otherwise it is converted to upper case.
 * @return string Tag name, in upper case unless $preserveCase is set
 * @see getFirstTag()
 */
public function getFirstTagName($str, $preserveCase = false)
{
    $found = [];
    if (preg_match('/^\\s*\\<([^\\s\\>]+)(\\s|\\>)/', $str, $found) !== 1) {
        return '';
    }
    return $preserveCase ? $found[1] : strtoupper($found[1]);
}
235
/**
 * Returns an array with all attributes as keys. Attribute keys are lower-cased; characters other than
 * alphanumerics, "_", ":" and "-" are removed from attribute names.
 * If an attribute is empty (shorthand), then the value for the key is empty. You can check if it existed with isset()
 *
 * Compared to the method in GeneralUtility::get_tag_attributes this method also returns meta data about each
 * attribute: "origTag" holds the attribute name in its original case, "dashType" holds the quotation
 * character of the value (empty string for unquoted values, not set for shorthand attributes).
 *
 * @param string $tag Tag: $tag is either a whole tag (eg '<TAG OPTION ATTRIB=VALUE>') or the parameterlist (ex ' OPTION ATTRIB=VALUE>')
 * @param bool $deHSC If set, the attribute values are de-htmlspecialchar'ed. Should actually always be set!
 * @return array array(Tag attributes,Attribute meta-data)
 */
public function get_tag_attributes($tag, $deHSC = false)
{
    list($components, $metaC) = $this->split_tag_attributes($tag);
    $attributes = [];
    $attributesMeta = [];
    // Always return the documented pair, even if no components could be determined
    if (!is_array($components)) {
        return [$attributes, $attributesMeta];
    }
    // Name of the attribute that is currently waiting for its value
    $name = '';
    $valuemode = false;
    foreach ($components as $key => $val) {
        // Only an unquoted "=" is the assignment operator. A quoted one (eg. title="=") is a value:
        // split_tag_attributes() strips the quotes, so the quotation meta data has to be consulted.
        $isAssignment = $val === '=' && (!isset($metaC[$key]) || $metaC[$key] === '');
        if ($isAssignment) {
            $valuemode = true;
            continue;
        }
        if ($valuemode) {
            // A value is only assigned if there is an attribute name waiting for it
            if ($name) {
                $attributes[$name] = $deHSC ? htmlspecialchars_decode($val) : $val;
                $attributesMeta[$name]['dashType'] = $metaC[$key];
                $name = '';
            }
        } elseif ($namekey = preg_replace('/[^[:alnum:]_\\:\\-]/', '', $val)) {
            // New attribute; stays a shorthand attribute unless a value follows
            $name = strtolower($namekey);
            $attributesMeta[$name] = [];
            $attributesMeta[$name]['origTag'] = $namekey;
            $attributes[$name] = '';
        }
        $valuemode = false;
    }
    return [$attributes, $attributesMeta];
}
282
/**
 * Splits an attribute list into its 'components': attribute names, "=" signs and values.
 * The result is normally analyzed by get_tag_attributes
 * A leading tag name ("<tag ") and the closing ">" are removed if found.
 *
 * In contrast to the method of the same name in GeneralUtility, this one also records for every
 * component whether it was enclosed by a " or ' character. Hence two parallel arrays are returned:
 * the components (quotes stripped) and, per component, the quote character or an empty string.
 *
 * @param string $tag The tag or attributes
 * @return array array(components, quote character per component)
 * @access private
 * @see \TYPO3\CMS\Core\Utility\GeneralUtility::split_tag_attributes()
 */
public function split_tag_attributes($tag)
{
    $tagMatch = [];
    if (preg_match('/(\\<[^\\s]+\\s+)?(.*?)\\s*(\\>)?$/s', $tag, $tagMatch) !== 1) {
        return [[], []];
    }
    $attributeString = $tagMatch[2];
    $components = [];
    $quotes = [];
    $tokens = [];
    if (preg_match_all('/("[^"]*"|\'[^\']*\'|[^\\s"\'\\=]+|\\=)/s', $attributeString, $tokens) > 0) {
        foreach ($tokens[1] as $token) {
            $leadingChar = $token[0];
            $isQuoted = $leadingChar === '"' || $leadingChar === '\'';
            // Quoted token: remember the quote character and strip the quotes from the value
            $quotes[] = $isQuoted ? $leadingChar : '';
            $components[] = $isQuoted ? substr($token, 1, -1) : $token;
        }
    }
    return [$components, $quotes];
}
321
322 /*********************************
323 *
324 * Clean HTML code
325 *
326 *********************************/
/**
 * Function that can clean up HTML content according to configuration given in the $tags array.
 *
 * Initializing the $tags array to allow a list of tags (in this case <B>,<I>,<U> and <A>), set it like this: $tags = array_flip(explode(',','b,a,i,u'))
 * If the value of the $tags[$tagname] entry is an array, advanced processing of the tags is initialized. These are the options:
 *
 * $tags[$tagname] = Array(
 *     'overrideAttribs' => ''   If set, this string is preset as the attributes of the tag
 *     'allowedAttribs' => '0' (zero) = no attributes allowed, '[commalist of attributes]' = only allowed attributes. If blank, all attributes are allowed.
 *     'fixAttrib' => Array(
 *         '[attribute name]' => Array (
 *             'set' => Force the attribute value to this value.
 *             'unset' => Boolean: If set, the attribute is unset.
 *             'default' => If no attribute exists by this name, this value is set as default value (if this value is not blank)
 *             'always' => Boolean. If set, the attribute is always processed. Normally an attribute is processed only if it exists
 *             'trim,intval,lower,upper' => All booleans. If any of these keys are set, the value is passed through the respective PHP-functions.
 *             'range' => Array ('[low limit]','[high limit, optional]') Setting integer range.
 *             'list' => Array ('[value1/default]','[value2]','[value3]') Attribute must be in this list. If not, the value is set to the first element.
 *             'removeIfFalse' => Boolean/'blank'. If set, then the attribute is removed if it is 'FALSE'. If this value is set to 'blank' then the value must be a blank string (that means a 'zero' value will not be removed)
 *             'removeIfEquals' => [value] If the attribute value matches the value set here, then it is removed.
 *             'casesensitiveComp' => 1 If set, then the removeIfEquals and list comparisons will be case sensitive. Otherwise not.
 *             'prefixLocalAnchors' => If set and the value starts with "#", the value is prefixed with the current request URL (value 2: with the URL returned by ContentObjectRenderer::getUrlToCurrentLocation()).
 *             'prefixRelPathWith' => If set, values that have no URL scheme and do not start with "/" are prefixed with this string.
 *             'userFunc' => User function the value is passed through; 'userFunc.' holds its parameters.
 *         )
 *     ),
 *     'protect' => '', Boolean. If set, the tag <> is converted to &lt; and &gt;
 *     'remap' => '', String. If set, the tagname is remapped to this tagname
 *     'rmTagIfNoAttrib' => '', Boolean. If set, then the tag is removed if no attributes happened to be there.
 *     'nesting' => '', Boolean/'global'. If set TRUE, then this tag must have starting and ending tags in the correct order. Any tags not in this order will be discarded. Thus '</B><B><I></B></I></B>' will be converted to '<B><I></B></I>'. Is the value 'global' then true nesting in relation to other tags marked for 'global' nesting control is preserved. This means that if <B> and <I> are set for global nesting then this string '</B><B><I></B></I></B>' is converted to '<B></B>'
 * )
 *
 * HTML comments and sections enclosed in "<![CDATA[*" + "/" ... "/" + "*]]>*" + "/" style markers are passed through untouched.
 *
 * @param string $content Is the HTML-content being processed. This is also the result being returned.
 * @param array $tags Is an array where each key is a tagname in lowercase. Only tags present as keys in this array are preserved. The value of the key can be an array with a vast number of options to configure.
 * @param mixed $keepAll Boolean/'protect', if set, then all tags are kept regardless of tags present as keys in $tags-array. If 'protect' then the preserved tags have their <> converted to &lt; and &gt;
 * @param int $hSC Values -1,0,1,2: Set to zero= disabled, set to 1 then the content BETWEEN tags is htmlspecialchar()'ed, set to -1 its the opposite and set to 2 the content will be HSC'ed BUT with preservation for real entities (eg. "&amp;" or "&#234;")
 * @param array $addConfig Configuration array send along as $conf to the internal functions
 * @return string Processed HTML content
 */
public function HTMLcleaner($content, $tags = [], $keepAll = 0, $hSC = 0, $addConfig = [])
{
    $newContent = [];
    $tokArr = explode('<', $content);
    $newContent[] = $this->bidir_htmlspecialchars(current($tokArr), $hSC);
    // We skip the first element in foreach loop
    $tokArrSliced = array_slice($tokArr, 1, null, true);
    $c = 1;
    $tagRegister = [];
    $tagStack = [];
    $inComment = false;
    $inCdata = false;
    $skipTag = false;
    // End marker of a CDATA section; its own length (not the length of the start marker) must be
    // skipped when cutting the section off a token
    $cdataEnd = '/*]]>*/';
    $cdataEndLength = strlen($cdataEnd);
    foreach ($tokArrSliced as $tok) {
        if ($inComment) {
            if (($eocPos = strpos($tok, '-->')) === false) {
                // End of comment is not found in the token. Go further until end of comment is found in other tokens.
                $newContent[$c++] = '<' . $tok;
                continue;
            }
            // Comment ends in the middle of the token: add comment and proceed with rest of the token
            $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3);
            $tok = substr($tok, $eocPos + 3);
            $inComment = false;
            $skipTag = true;
        } elseif ($inCdata) {
            if (($eocPos = strpos($tok, $cdataEnd)) === false) {
                // End of CDATA section is not found in the token. Go further until it is found in other tokens.
                $newContent[$c++] = '<' . $tok;
                continue;
            }
            // CDATA section ends in the middle of the token: add it and proceed with rest of the token
            $newContent[$c++] = '<' . substr($tok, 0, $eocPos + $cdataEndLength);
            $tok = substr($tok, $eocPos + $cdataEndLength);
            $inCdata = false;
            $skipTag = true;
        } elseif (substr($tok, 0, 3) === '!--') {
            if (($eocPos = strpos($tok, '-->')) === false) {
                // Comment started in this token but it does not end in the same token. Set a flag to skip till the end of comment
                $newContent[$c++] = '<' . $tok;
                $inComment = true;
                continue;
            }
            // Start and end of comment are both in the current token. Add comment and proceed with rest of the token
            $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3);
            $tok = substr($tok, $eocPos + 3);
            $skipTag = true;
        } elseif (substr($tok, 0, 10) === '![CDATA[*/') {
            if (($eocPos = strpos($tok, $cdataEnd)) === false) {
                // CDATA section started in this token but it does not end in the same token. Set a flag to skip till its end
                $newContent[$c++] = '<' . $tok;
                $inCdata = true;
                continue;
            }
            // Start and end of the CDATA section are both in the current token. Add it and proceed with rest of the token
            $newContent[$c++] = '<' . substr($tok, 0, $eocPos + $cdataEndLength);
            $tok = substr($tok, $eocPos + $cdataEndLength);
            $skipTag = true;
        }
        // The token is empty if "<" was the last character or directly followed by another "<"
        $firstChar = isset($tok[0]) ? $tok[0] : '';
        // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..>
        if (!$skipTag && preg_match('/[[:alnum:]\\/]/', $firstChar) == 1) {
            $tagEnd = strpos($tok, '>');
            // If there is and end-bracket... tagEnd can't be 0 as the first character can't be a >
            if ($tagEnd) {
                $endTag = $firstChar === '/' ? 1 : 0;
                $tagContent = substr($tok, $endTag, $tagEnd - $endTag);
                $tagParts = preg_split('/\\s+/s', $tagContent, 2);
                // Tags without attributes yield a single part only
                if (!isset($tagParts[1])) {
                    $tagParts[1] = '';
                }
                $tagName = strtolower($tagParts[0]);
                $emptyTag = 0;
                if (isset($tags[$tagName])) {
                    // If there is processing to do for the tag:
                    if (is_array($tags[$tagName])) {
                        if (preg_match('/^(' . self::VOID_ELEMENTS . ')$/i', $tagName)) {
                            $emptyTag = 1;
                        }
                        // If NOT an endtag, do attribute processing (added dec. 2003)
                        if (!$endTag) {
                            // Override attributes
                            if ((string)$tags[$tagName]['overrideAttribs'] !== '') {
                                $tagParts[1] = $tags[$tagName]['overrideAttribs'];
                            }
                            // Allowed tags
                            if ((string)$tags[$tagName]['allowedAttribs'] !== '') {
                                // No attribs allowed
                                if ((string)$tags[$tagName]['allowedAttribs'] === '0') {
                                    $tagParts[1] = '';
                                } elseif (trim($tagParts[1])) {
                                    $tagAttrib = $this->get_tag_attributes($tagParts[1]);
                                    $tagParts[1] = '';
                                    $newTagAttrib = [];
                                    if (!($tList = $tags[$tagName]['_allowedAttribs'])) {
                                        // Just explode attribts for tag once
                                        $tList = ($tags[$tagName]['_allowedAttribs'] = GeneralUtility::trimExplode(',', strtolower($tags[$tagName]['allowedAttribs']), true));
                                    }
                                    foreach ($tList as $allowTag) {
                                        if (isset($tagAttrib[0][$allowTag])) {
                                            $newTagAttrib[$allowTag] = $tagAttrib[0][$allowTag];
                                        }
                                    }
                                    $tagParts[1] = $this->compileTagAttribs($newTagAttrib, $tagAttrib[1]);
                                }
                            }
                            // Fixed attrib values
                            if (is_array($tags[$tagName]['fixAttrib'])) {
                                $tagAttrib = $this->get_tag_attributes($tagParts[1]);
                                $tagParts[1] = '';
                                foreach ($tags[$tagName]['fixAttrib'] as $attr => $params) {
                                    if (isset($params['set']) && $params['set'] !== '') {
                                        $tagAttrib[0][$attr] = $params['set'];
                                    }
                                    if (!empty($params['unset'])) {
                                        unset($tagAttrib[0][$attr]);
                                    }
                                    if (!isset($tagAttrib[0][$attr]) && (string)$params['default'] !== '') {
                                        $tagAttrib[0][$attr] = $params['default'];
                                    }
                                    if ($params['always'] || isset($tagAttrib[0][$attr])) {
                                        if ($params['trim']) {
                                            $tagAttrib[0][$attr] = trim($tagAttrib[0][$attr]);
                                        }
                                        if ($params['intval']) {
                                            $tagAttrib[0][$attr] = (int)$tagAttrib[0][$attr];
                                        }
                                        if ($params['lower']) {
                                            $tagAttrib[0][$attr] = strtolower($tagAttrib[0][$attr]);
                                        }
                                        if ($params['upper']) {
                                            $tagAttrib[0][$attr] = strtoupper($tagAttrib[0][$attr]);
                                        }
                                        if ($params['range']) {
                                            if (isset($params['range'][1])) {
                                                $tagAttrib[0][$attr] = MathUtility::forceIntegerInRange($tagAttrib[0][$attr], (int)$params['range'][0], (int)$params['range'][1]);
                                            } else {
                                                $tagAttrib[0][$attr] = MathUtility::forceIntegerInRange($tagAttrib[0][$attr], (int)$params['range'][0]);
                                            }
                                        }
                                        if (is_array($params['list'])) {
                                            // For the class attribute, remove from the attribute value any class not in the list
                                            // Classes are case sensitive
                                            if ($attr === 'class') {
                                                $newClasses = [];
                                                $classes = GeneralUtility::trimExplode(' ', $tagAttrib[0][$attr], true);
                                                foreach ($classes as $class) {
                                                    if (in_array($class, $params['list'])) {
                                                        $newClasses[] = $class;
                                                    }
                                                }
                                                if (!empty($newClasses)) {
                                                    $tagAttrib[0][$attr] = implode(' ', $newClasses);
                                                } else {
                                                    $tagAttrib[0][$attr] = $params['list'][0];
                                                }
                                            } else {
                                                // No cache key is passed to caseShift() for the list: a key made of the tag
                                                // name alone would hand out the list of another attribute (or of another
                                                // configuration) of the same tag.
                                                if (!in_array($this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']), $this->caseShift($params['list'], $params['casesensitiveComp']))) {
                                                    $tagAttrib[0][$attr] = $params['list'][0];
                                                }
                                            }
                                        }
                                        if ($params['removeIfFalse'] && $params['removeIfFalse'] !== 'blank' && !$tagAttrib[0][$attr] || $params['removeIfFalse'] === 'blank' && (string)$tagAttrib[0][$attr] === '') {
                                            unset($tagAttrib[0][$attr]);
                                        }
                                        if ((string)$params['removeIfEquals'] !== '' && $this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']) === $this->caseShift($params['removeIfEquals'], $params['casesensitiveComp'])) {
                                            unset($tagAttrib[0][$attr]);
                                        }
                                        if ($params['prefixLocalAnchors']) {
                                            if ($tagAttrib[0][$attr][0] === '#') {
                                                if ($params['prefixLocalAnchors'] == 2) {
                                                    /** @var ContentObjectRenderer $contentObjectRenderer */
                                                    $contentObjectRenderer = GeneralUtility::makeInstance(ContentObjectRenderer::class);
                                                    $prefix = $contentObjectRenderer->getUrlToCurrentLocation();
                                                } else {
                                                    $prefix = GeneralUtility::getIndpEnv('TYPO3_REQUEST_URL');
                                                }
                                                $tagAttrib[0][$attr] = $prefix . $tagAttrib[0][$attr];
                                            }
                                        }
                                        if ($params['prefixRelPathWith']) {
                                            $urlParts = parse_url($tagAttrib[0][$attr]);
                                            if (!$urlParts['scheme'] && $urlParts['path'][0] !== '/') {
                                                // If it is NOT an absolute URL (by http: or starting "/")
                                                $tagAttrib[0][$attr] = $params['prefixRelPathWith'] . $tagAttrib[0][$attr];
                                            }
                                        }
                                        if ($params['userFunc']) {
                                            if (is_array($params['userFunc.'])) {
                                                $params['userFunc.']['attributeValue'] = $tagAttrib[0][$attr];
                                            } else {
                                                $params['userFunc.'] = $tagAttrib[0][$attr];
                                            }
                                            $tagAttrib[0][$attr] = GeneralUtility::callUserFunction($params['userFunc'], $params['userFunc.'], $this);
                                        }
                                    }
                                }
                                $tagParts[1] = $this->compileTagAttribs($tagAttrib[0], $tagAttrib[1]);
                            }
                        } else {
                            // If endTag, remove any possible attributes:
                            $tagParts[1] = '';
                        }
                        // Protecting the tag by converting < and > to &lt; and &gt; ??
                        if ($tags[$tagName]['protect']) {
                            $lt = '&lt;';
                            $gt = '&gt;';
                        } else {
                            $lt = '<';
                            $gt = '>';
                        }
                        // Remapping tag name?
                        if ($tags[$tagName]['remap']) {
                            $tagParts[0] = $tags[$tagName]['remap'];
                        }
                        // rmTagIfNoAttrib
                        if ($endTag || trim($tagParts[1]) || !$tags[$tagName]['rmTagIfNoAttrib']) {
                            $setTag = 1;
                            // Remove this closing tag if $tagName was among $TSconfig['removeTags']
                            if ($endTag && $tags[$tagName]['allowedAttribs'] === 0 && $tags[$tagName]['rmTagIfNoAttrib'] === 1) {
                                $setTag = 0;
                            }
                            if ($tags[$tagName]['nesting']) {
                                if (!is_array($tagRegister[$tagName])) {
                                    $tagRegister[$tagName] = [];
                                }
                                if ($endTag) {
                                    $correctTag = 1;
                                    if ($tags[$tagName]['nesting'] === 'global') {
                                        $lastEl = end($tagStack);
                                        if ($tagName !== $lastEl) {
                                            if (in_array($tagName, $tagStack)) {
                                                // Discard all start tags opened after the matching one
                                                while (!empty($tagStack) && $tagName !== $lastEl) {
                                                    $elPos = end($tagRegister[$lastEl]);
                                                    unset($newContent[$elPos]);
                                                    array_pop($tagRegister[$lastEl]);
                                                    array_pop($tagStack);
                                                    $lastEl = end($tagStack);
                                                }
                                            } else {
                                                // The end tag has no start tag on the stack: it is not set
                                                $correctTag = 0;
                                            }
                                        }
                                    }
                                    if (empty($tagRegister[$tagName]) || !$correctTag) {
                                        $setTag = 0;
                                    } else {
                                        array_pop($tagRegister[$tagName]);
                                        if ($tags[$tagName]['nesting'] === 'global') {
                                            array_pop($tagStack);
                                        }
                                    }
                                } else {
                                    // Remember the output position of the start tag until its end tag shows up
                                    $tagRegister[$tagName][] = $c;
                                    if ($tags[$tagName]['nesting'] === 'global') {
                                        $tagStack[] = $tagName;
                                    }
                                }
                            }
                            if ($setTag) {
                                // Setting the tag
                                $newContent[$c++] = $lt . ($endTag ? '/' : '') . trim($tagParts[0] . ' ' . $tagParts[1]) . ($emptyTag ? ' /' : '') . $gt;
                            }
                        }
                    } else {
                        $newContent[$c++] = '<' . ($endTag ? '/' : '') . $tagContent . '>';
                    }
                } elseif ($keepAll) {
                    // This is if the tag was not defined in the array for processing:
                    if ($keepAll === 'protect') {
                        $lt = '&lt;';
                        $gt = '&gt;';
                    } else {
                        $lt = '<';
                        $gt = '>';
                    }
                    $newContent[$c++] = $lt . ($endTag ? '/' : '') . $tagContent . $gt;
                }
                $newContent[$c++] = $this->bidir_htmlspecialchars(substr($tok, $tagEnd + 1), $hSC);
            } else {
                $newContent[$c++] = $this->bidir_htmlspecialchars('<' . $tok, $hSC);
            }
        } else {
            $newContent[$c++] = $this->bidir_htmlspecialchars(($skipTag ? '' : '<') . $tok, $hSC);
            // It was not a tag anyways
            $skipTag = false;
        }
    }
    // Unsetting tags: start tags under nesting control that never got their end tag
    foreach ($tagRegister as $positions) {
        foreach ($positions as $pKey) {
            unset($newContent[$pKey]);
        }
    }
    $newContent = implode('', $newContent);
    $newContent = $this->stripEmptyTagsIfConfigured($newContent, $addConfig);
    return $newContent;
}
659
/**
 * Applies htmlspecialchars() to a value, forth ($dir=1 or 2) or back ($dir=-1)
 *
 * @param string $value Input value
 * @param int $dir Direction: 1 = htmlspecialchars(), 2 = htmlspecialchars() without double encoding of existing entities, -1 = htmlspecialchars_decode(); any other value leaves the input untouched
 * @return string Output value
 */
public function bidir_htmlspecialchars($value, $dir)
{
    $direction = (int)$dir;
    if ($direction === 1) {
        return htmlspecialchars($value);
    }
    if ($direction === 2) {
        // Existing entities are preserved
        return htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false);
    }
    if ($direction === -1) {
        return htmlspecialchars_decode($value);
    }
    return $value;
}
680
/**
 * Prefixes the relative paths of hrefs/src/action in the tags [td,table,body,img,input,form,link,script,a,embed,param] in the $content with the $main_prefix or and alternative given by $alternatives
 *
 * Processed attributes: "background" (td, body, table), "src" (img, input, script, embed), "href" (link, a),
 * "action" (form) and "value" of param tags having name="movie". A tag is only rebuilt if the attribute
 * has a non-empty value. In addition, url(...) references inside <style> blocks are prefixed.
 *
 * @param string $main_prefix Prefix string
 * @param string $content HTML content
 * @param array $alternatives Array with alternative prefixes for certain of the tags. key=>value pairs where the keys are the tag element names in uppercase (the key for <style> blocks is the lowercase string "style")
 * @param string $suffix Suffix string (put after the resource).
 * @return string Processed HTML content
 */
public function prefixResourcePath($main_prefix, $content, $alternatives = [], $suffix = '')
{
    // Tag name => attribute that holds the resource path
    $resourceAttributes = [
        'td' => 'background',
        'body' => 'background',
        'table' => 'background',
        'img' => 'src',
        'input' => 'src',
        'script' => 'src',
        'embed' => 'src',
        'link' => 'href',
        'a' => 'href',
        'form' => 'action',
    ];
    $parts = $this->splitTags('embed,td,table,body,img,input,form,link,script,a,param', $content);
    foreach ($parts as $k => $v) {
        // Only odd entries are tags
        if ($k % 2 === 0) {
            continue;
        }
        $params = $this->get_tag_attributes($v);
        // Detect tag-ending so that it is re-applied correctly.
        $tagEnd = substr($v, -2) === '/>' ? ' />' : '>';
        // The 'name' of the first tag
        $firstTagName = $this->getFirstTagName($v);
        $lowerTagName = strtolower($firstTagName);
        $prefix = isset($alternatives[strtoupper($firstTagName)]) ? $alternatives[strtoupper($firstTagName)] : $main_prefix;
        // Find the attribute to process for this tag
        $attribute = '';
        if (isset($resourceAttributes[$lowerTagName])) {
            $attribute = $resourceAttributes[$lowerTagName];
        } elseif ($lowerTagName === 'param' && isset($params[0]['name']) && $params[0]['name'] === 'movie') {
            $attribute = 'value';
        }
        // Nothing to do if the tag does not carry the attribute (or carries it with an empty value)
        if ($attribute === '' || empty($params[0][$attribute])) {
            continue;
        }
        $params[0][$attribute] = $this->prefixRelPath($prefix, $params[0][$attribute], $suffix);
        $parts[$k] = '<' . trim($lowerTagName . ' ' . $this->compileTagAttribs($params[0], $params[1])) . $tagEnd;
    }
    $content = implode('', $parts);
    // Fix <style> section:
    $prefix = isset($alternatives['style']) ? $alternatives['style'] : $main_prefix;
    if ((string)$prefix !== '') {
        $parts = $this->splitIntoBlock('style', $content);
        foreach ($parts as $k => &$part) {
            if ($k % 2) {
                // A callback is used instead of a replacement string, so that prefix and suffix are inserted
                // literally: a prefix starting with a digit or containing "$1" / "\1" must not be read as
                // (part of) a backreference.
                $part = preg_replace_callback(
                    '/(url[[:space:]]*\\([[:space:]]*["\']?)([^"\')]*)(["\']?[[:space:]]*\\))/i',
                    function (array $match) use ($prefix, $suffix) {
                        return $match[1] . $prefix . $match[2] . $suffix . $match[3];
                    },
                    $part
                );
            }
        }
        unset($part);
        $content = implode('', $parts);
    }
    return $content;
}
775
/**
 * Internal sub-function for ->prefixResourcePath()
 *
 * The value is returned unchanged if it starts with "/" (absolute path) or "#" (link to a section
 * within the page) or if it contains a URL scheme. Otherwise prefix and suffix are added.
 *
 * @param string $prefix Prefix string
 * @param string $srcVal Relative path/URL
 * @param string $suffix Suffix string
 * @return string Output path, prefixed if no scheme in input string
 * @access private
 */
public function prefixRelPath($prefix, $srcVal, $suffix = '')
{
    // An empty value has no first character to look at
    $firstChar = isset($srcVal[0]) ? $srcVal[0] : '';
    if ($firstChar === '/' || $firstChar === '#') {
        return $srcVal;
    }
    // parse_url() only sets the "scheme" key if a scheme is present and returns FALSE for
    // seriously malformed URLs; in both cases the value is treated as having no scheme.
    $urlParts = parse_url($srcVal);
    if (empty($urlParts['scheme'])) {
        $srcVal = $prefix . $srcVal . $suffix;
    }
    return $srcVal;
}
798
/**
 * Internal function for case shifting of a string or whole array
 *
 * With case sensitive comparison the input is returned untouched. Otherwise a string is returned in
 * upper case and an array is returned with all its values in upper case. Upper-cased arrays are kept in
 * a runtime cache if a cache key is given; a cached entry is returned without looking at the input array.
 *
 * @param mixed $str Input string/array
 * @param bool $caseSensitiveComparison If this value is FALSE, the string is returned in uppercase
 * @param string $cacheKey Key string used for internal caching of the results. Could be an MD5 hash of the serialized version of the input $str if that is an array.
 * @return mixed Output string/array, processed
 * @access private
 */
public function caseShift($str, $caseSensitiveComparison, $cacheKey = '')
{
    if ($caseSensitiveComparison) {
        return $str;
    }
    if (!is_array($str)) {
        return strtoupper($str);
    }
    // Fetch from runlevel cache
    if ($cacheKey && isset($this->caseShift_cache[$cacheKey])) {
        return $this->caseShift_cache[$cacheKey];
    }
    $upperCased = array_map('strtoupper', $str);
    if ($cacheKey) {
        $this->caseShift_cache[$cacheKey] = $upperCased;
    }
    return $upperCased;
}
830
/**
 * Compiling an array with tag attributes into a string
 *
 * For every attribute the original name ("origTag") and quotation ("dashType") from $meta are used if
 * available. Attributes without meta data (eg. attributes that were added by configuration) fall back to
 * the array key as name and to double quotes, except for integer values, which are written unquoted.
 * An attribute with an empty value and without recorded quotation is written as shorthand attribute.
 *
 * @param array $tagAttrib Tag attributes
 * @param array $meta Meta information about these attributes (like if they were quoted)
 * @return string Imploded attributes, eg: 'attribute="value" attrib2="value2"'
 * @access private
 */
public function compileTagAttribs($tagAttrib, $meta = [])
{
    $accu = [];
    foreach ($tagAttrib as $k => $v) {
        // Meta data is optional per attribute, so it is only read if it exists
        $attr = !empty($meta[$k]['origTag']) ? $meta[$k]['origTag'] : $k;
        if (strcmp($v, '') || isset($meta[$k]['dashType'])) {
            if (!empty($meta[$k]['dashType'])) {
                $dash = $meta[$k]['dashType'];
            } else {
                $dash = MathUtility::canBeInterpretedAsInteger($v) ? '' : '"';
            }
            $attr .= '=' . $dash . $v . $dash;
        }
        $accu[] = $attr;
    }
    return implode(' ', $accu);
}
852
/**
 * Converts TSconfig into an array for the HTMLcleaner function.
 *
 * Processing order:
 * - "allowTags" (lower-cased list) is merged with the incoming $keepTags
 * - "tags." scalar entries: "0" removes a tag, "1" adds it if missing
 * - "tags." array entries are merged into the tag's configuration; the
 *   "range" and "list" options of "fixAttrib." are exploded into arrays
 * - "localNesting", "globalNesting", "rmTagIfNoAttrib" and "noAttrib" set
 *   flags on tags that are already part of the keep list
 * - "removeTags" replaces the configuration of each listed tag
 *
 * All options are optional; missing keys are treated as not configured.
 *
 * @param array $TSconfig TSconfig for HTMLcleaner
 * @param array $keepTags Array of tags to keep (?)
 * @return array Four elements: the tag configuration, "keepNonMatchedTags" as string, "htmlSpecialChars" as integer and the additional configuration ("stripEmptyTags" settings)
 * @access private
 */
public function HTMLparserConfig($TSconfig, $keepTags = [])
{
    // Returns the lower-cased, trimmed tag names of a comma list option,
    // or an empty array when the option is missing or evaluates to false.
    $tagsFromOption = function ($option) use ($TSconfig) {
        if (empty($TSconfig[$option])) {
            return [];
        }
        return GeneralUtility::trimExplode(',', strtolower($TSconfig[$option]), true);
    };

    // Allow tags (base list, merged with incoming array)
    $allowTags = isset($TSconfig['allowTags']) ? $TSconfig['allowTags'] : '';
    $alTags = array_flip(GeneralUtility::trimExplode(',', strtolower($allowTags), true));
    $keepTags = array_merge($alTags, $keepTags);
    // Set config properties.
    if (isset($TSconfig['tags.']) && is_array($TSconfig['tags.'])) {
        // First pass: scalar switches ("0" removes a tag, "1" adds it)
        foreach ($TSconfig['tags.'] as $key => $tagC) {
            if (!is_array($tagC) && $key == strtolower($key)) {
                if ((string)$tagC === '0') {
                    unset($keepTags[$key]);
                }
                if ((string)$tagC === '1' && !isset($keepTags[$key])) {
                    $keepTags[$key] = 1;
                }
            }
        }
        // Second pass: per-tag sub configuration ("tagname." arrays)
        foreach ($TSconfig['tags.'] as $key => $tagC) {
            if (is_array($tagC) && $key == strtolower($key)) {
                // Cut off the last character of the key (the trailing dot)
                $key = substr($key, 0, -1);
                if (!isset($keepTags[$key]) || !is_array($keepTags[$key])) {
                    $keepTags[$key] = [];
                }
                if (isset($tagC['fixAttrib.']) && is_array($tagC['fixAttrib.'])) {
                    foreach ($tagC['fixAttrib.'] as $atName => $atConfig) {
                        if (is_array($atConfig)) {
                            $atName = substr($atName, 0, -1);
                            if (!isset($keepTags[$key]['fixAttrib'][$atName]) || !is_array($keepTags[$key]['fixAttrib'][$atName])) {
                                $keepTags[$key]['fixAttrib'][$atName] = [];
                            }
                            $merged = array_merge($keepTags[$key]['fixAttrib'][$atName], $atConfig);
                            // "range" and "list" are given as comma lists and stored as arrays
                            foreach (['range', 'list'] as $listOption) {
                                if (isset($merged[$listOption]) && (string)$merged[$listOption] !== '') {
                                    $merged[$listOption] = GeneralUtility::trimExplode(',', $merged[$listOption]);
                                }
                            }
                            $keepTags[$key]['fixAttrib'][$atName] = $merged;
                        }
                    }
                }
                // "fixAttrib" has been handled above and must not be overwritten by the merge below
                unset($tagC['fixAttrib.']);
                unset($tagC['fixAttrib']);
                if (!empty($tagC['rmTagIfNoAttrib']) && empty($tagC['nesting'])) {
                    $tagC['nesting'] = 1;
                }
                $keepTags[$key] = array_merge($keepTags[$key], $tagC);
            }
        }
    }
    // LocalNesting
    foreach ($tagsFromOption('localNesting') as $tn) {
        if (isset($keepTags[$tn])) {
            if (!is_array($keepTags[$tn])) {
                $keepTags[$tn] = [];
            }
            $keepTags[$tn]['nesting'] = 1;
        }
    }
    // GlobalNesting
    foreach ($tagsFromOption('globalNesting') as $tn) {
        if (isset($keepTags[$tn])) {
            if (!is_array($keepTags[$tn])) {
                $keepTags[$tn] = [];
            }
            $keepTags[$tn]['nesting'] = 'global';
        }
    }
    // Removing a tag without attributes also enables nesting, unless nesting is set already
    foreach ($tagsFromOption('rmTagIfNoAttrib') as $tn) {
        if (isset($keepTags[$tn])) {
            if (!is_array($keepTags[$tn])) {
                $keepTags[$tn] = [];
            }
            $keepTags[$tn]['rmTagIfNoAttrib'] = 1;
            if (empty($keepTags[$tn]['nesting'])) {
                $keepTags[$tn]['nesting'] = 1;
            }
        }
    }
    foreach ($tagsFromOption('noAttrib') as $tn) {
        if (isset($keepTags[$tn])) {
            if (!is_array($keepTags[$tn])) {
                $keepTags[$tn] = [];
            }
            $keepTags[$tn]['allowedAttribs'] = 0;
        }
    }
    // Applies to every listed tag, whether or not it is in the keep list,
    // and replaces any configuration collected for it so far
    foreach ($tagsFromOption('removeTags') as $tn) {
        $keepTags[$tn] = [];
        $keepTags[$tn]['allowedAttribs'] = 0;
        $keepTags[$tn]['rmTagIfNoAttrib'] = 1;
    }
    // Create additional configuration:
    $addConfig = [];
    if (isset($TSconfig['stripEmptyTags'])) {
        $addConfig['stripEmptyTags'] = $TSconfig['stripEmptyTags'];
        if (isset($TSconfig['stripEmptyTags.'])) {
            $addConfig['stripEmptyTags.'] = $TSconfig['stripEmptyTags.'];
        }
    }
    return [
        $keepTags,
        isset($TSconfig['keepNonMatchedTags']) ? '' . $TSconfig['keepNonMatchedTags'] : '',
        isset($TSconfig['htmlSpecialChars']) ? (int)$TSconfig['htmlSpecialChars'] : 0,
        $addConfig
    ];
}
981
/**
 * Strips empty tags from HTML.
 *
 * A tag counts as empty when nothing but space characters (and, if enabled,
 * &nbsp; entities) stands between its opening and its closing tag. The
 * replacement is repeated until nothing is removed any more, so tags that
 * become empty because their empty children were removed are stripped too.
 *
 * Tag names from $tagList are matched literally and as complete names: "p"
 * does not match "<pre>". If the regular expression cannot be executed, the
 * content is returned in the state reached so far.
 *
 * @param string $content The content to be stripped of empty tags
 * @param string $tagList The comma separated list of tags to be stripped.
 *                        If empty, all empty tags will be stripped
 * @param bool $treatNonBreakingSpaceAsEmpty If TRUE tags containing only &nbsp; entities will be treated as empty.
 * @param bool $keepTags If true, the provided tags will be kept instead of stripped.
 * @return string the stripped content
 */
public function stripEmptyTags($content, $tagList = '', $treatNonBreakingSpaceAsEmpty = false, $keepTags = false)
{
    // A tag name ends right before whitespace, "/" or ">". Without this
    // check a listed name would also match every tag it is a prefix of.
    $nameEnd = '(?=[\\s\\/>])';
    $tagNames = [];
    if (!empty($tagList)) {
        foreach (GeneralUtility::trimExplode(',', $tagList, true) as $tagName) {
            // Tag names are literal text, not regular expression syntax
            $tagNames[] = preg_quote($tagName, '/');
        }
    }
    if (!empty($tagNames)) {
        $tagRegEx = implode('|', $tagNames);
        if ($keepTags) {
            $tagRegEx = '(?!(?:' . $tagRegEx . ')' . $nameEnd . ')[^ >]+';
        }
    } else {
        // No usable tag name given: all characters until you reach a > or space
        $tagRegEx = '[^ >]+';
    }
    $nbspRegex = $treatNonBreakingSpaceAsEmpty ? '|(&nbsp;)' : '';
    $finalRegex = sprintf('/<(%s)%s[^>]*>( %s)*<\\/\\1(?=[\\s>])[^>]*>/i', $tagRegEx, $nameEnd, $nbspRegex);
    do {
        $count = 0;
        $stripped = preg_replace($finalRegex, '', $content, -1, $count);
        if ($stripped === null) {
            // preg_replace() returns NULL on failure; keep the content
            // instead of replacing it with NULL
            break;
        }
        $content = $stripped;
    } while ($count !== 0);
    return $content;
}
1010
/**
 * Strips the configured empty tags from the HTML code.
 *
 * Does nothing unless "stripEmptyTags" is set in the configuration. The
 * sub configuration "stripEmptyTags." may provide "keepTags" (tags that
 * are never stripped) or "tags" (the only tags to strip) - "keepTags" wins
 * if both are set - as well as "treatNonBreakingSpaceAsEmpty".
 *
 * @param string $value
 * @param array $configuration
 * @return string
 */
protected function stripEmptyTagsIfConfigured($value, $configuration)
{
    if (empty($configuration['stripEmptyTags'])) {
        return $value;
    }

    $keepTags = !empty($configuration['stripEmptyTags.']['keepTags']);
    if ($keepTags) {
        $tags = $configuration['stripEmptyTags.']['keepTags'];
    } elseif (!empty($configuration['stripEmptyTags.']['tags'])) {
        $tags = $configuration['stripEmptyTags.']['tags'];
    } else {
        // No tag list configured
        $tags = null;
    }

    return $this->stripEmptyTags(
        $value,
        $tags,
        !empty($configuration['stripEmptyTags.']['treatNonBreakingSpaceAsEmpty']),
        $keepTags
    );
}
1037 }