[FEATURE] Keep tags when stripping empty tags in HtmlParser
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Tests / Unit / Html / HtmlParserTest.php
1 <?php
2 namespace TYPO3\CMS\Core\Tests\Unit\Html;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Html\HtmlParser;
18
19 /**
20 * Testcase for \TYPO3\CMS\Core\Html\HtmlParser
21 */
22 class HtmlParserTest extends \TYPO3\CMS\Core\Tests\UnitTestCase
23 {
/**
 * The parser instance that the tests in this class call.
 *
 * @var \TYPO3\CMS\Core\Html\HtmlParser
 */
protected $subject = null;

/**
 * Assigns a new HtmlParser instance to $this->subject.
 */
protected function setUp()
{
    $parser = new HtmlParser();
    $this->subject = $parser;
}
33
/**
 * Provides markup containing CDATA sections.
 *
 * Each data set is [source, expected], where both entries are the same
 * string.
 *
 * @return array
 */
public function cDataWillRemainUnmodifiedDataProvider()
{
    $cdataSamples = [
        'single-line CDATA' => '/*<![CDATA[*/ <hello world> /*]]>*/',
        'multi-line CDATA #1' => '/*<![CDATA[*/' . LF . '<hello world> /*]]>*/',
        'multi-line CDATA #2' => '/*<![CDATA[*/ <hello world>' . LF . '/*]]>*/',
        'multi-line CDATA #3' => '/*<![CDATA[*/' . LF . '<hello world>' . LF . '/*]]>*/',
    ];

    // Source and expectation are identical for every sample.
    $dataSets = [];
    foreach ($cdataSamples as $label => $markup) {
        $dataSets[$label] = [$markup, $markup];
    }

    return $dataSets;
}
58
/**
 * Data provider for splitIntoBlock
 *
 * Each data set is [tag list, content, eliminateExtraEndTags flag, expected parts].
 *
 * @return array
 */
public function splitIntoBlockDataProvider()
{
    $dataSets = [];

    $dataSets['splitBlock'] = [
        'h1,span',
        '<body><h1>Title</h1><span>Note</span></body>',
        false,
        ['<body>', '<h1>Title</h1>', '', '<span>Note</span>', '</body>'],
    ];

    $dataSets['splitBlock br'] = [
        'h1,span',
        '<body><h1>Title</h1><br /><span>Note</span><br /></body>',
        false,
        ['<body>', '<h1>Title</h1>', '<br />', '<span>Note</span>', '<br /></body>'],
    ];

    $dataSets['splitBlock with attribute'] = [
        'h1,span',
        '<body><h1 class="title">Title</h1><span>Note</span></body>',
        false,
        ['<body>', '<h1 class="title">Title</h1>', '', '<span>Note</span>', '</body>'],
    ];

    $dataSets['splitBlock span with attribute'] = [
        'span',
        '<body><h1>Title</h1><span class="title">Note</span></body>',
        false,
        ['<body><h1>Title</h1>', '<span class="title">Note</span>', '</body>'],
    ];

    // The content carries a surplus </div>; it does not appear in the expected parts.
    $dataSets['splitBlock without extra end tags'] = [
        'h1,span,div',
        '<body><h1>Title</h1><span>Note</span></body></div>',
        true,
        ['<body>', '<h1>Title</h1>', '', '<span>Note</span>', '</body>'],
    ];

    return $dataSets;
}
117
/**
 * Asserts that splitIntoBlock() on the subject returns exactly the expected parts.
 *
 * @test
 * @param string $tag List of tags, comma separated.
 * @param string $content HTML-content
 * @param bool $eliminateExtraEndTags If set, excessive end tags are ignored - you should probably set this in most cases.
 * @param array $expected The expected result
 * @dataProvider splitIntoBlockDataProvider
 */
public function splitIntoBlock($tag, $content, $eliminateExtraEndTags, $expected)
{
    $actualParts = $this->subject->splitIntoBlock($tag, $content, $eliminateExtraEndTags);

    $this->assertSame($expected, $actualParts);
}
130
/**
 * Runs HTMLcleaner() with an empty tag configuration and 1 as third
 * argument, and asserts the result is identical to the expected string.
 *
 * @test
 * @param string $source
 * @param string $expected
 * @dataProvider cDataWillRemainUnmodifiedDataProvider
 */
public function xHtmlCleaningDoesNotModifyCDATA($source, $expected)
{
    $this->assertSame(
        $expected,
        $this->subject->HTMLcleaner($source, [], 1)
    );
}
142
/**
 * Data provider for tagCorrectlyRemovedWhenRmTagIfNoAttribIsConfigured
 *
 * Each data set is [content, expected result].
 */
public static function spanTagCorrectlyRemovedWhenRmTagIfNoAttribIsConfiguredDataProvider()
{
    $dataSets = [];

    $dataSets['Span tag with no attrib'] = [
        '<span>text</span>',
        'text',
    ];
    $dataSets['Span tag with allowed id attrib'] = [
        '<span id="id">text</span>',
        '<span id="id">text</span>',
    ];
    $dataSets['Span tag with disallowed style attrib'] = [
        '<span style="line-height: 12px;">text</span>',
        'text',
    ];

    return $dataSets;
}
163
/**
 * Cleans the content with a configuration that allows only span tags,
 * allows only their id attribute and sets rmTagIfNoAttrib for span,
 * then compares the output with the expected result.
 *
 * @test
 * @param string $content
 * @param string $expectedResult
 * @dataProvider spanTagCorrectlyRemovedWhenRmTagIfNoAttribIsConfiguredDataProvider
 */
public function tagCorrectlyRemovedWhenRmTagIfNoAttribIsConfigured($content, $expectedResult)
{
    $spanConfig = [
        'allowedAttribs' => 'id',
        'rmTagIfNoAttrib' => 1,
    ];
    $tsConfig = [
        'allowTags' => 'span',
        'tags.' => [
            'span.' => $spanConfig,
        ],
    ];

    $cleaned = $this->parseConfigAndCleanHtml($tsConfig, $content);

    $this->assertEquals($expectedResult, $cleaned);
}
183
/**
 * Cleans wrongly nested div/span markup with rmTagIfNoAttrib set for span
 * and globalNesting set for div and span, and expects only the span that
 * carries an id attribute to remain.
 *
 * @test
 */
public function rmTagIfNoAttribIsConfiguredDoesNotChangeNestingType()
{
    $content = '<span></span><span id="test"><div></span></div>';
    $expectedResult = '<span id="test"></span>';

    $tsConfig = [
        'allowTags' => 'div,span',
        'rmTagIfNoAttrib' => 'span',
        'globalNesting' => 'div,span',
    ];
    $cleaned = $this->parseConfigAndCleanHtml($tsConfig, $content);

    $this->assertEquals($expectedResult, $cleaned);
}
198
/**
 * Data provider for localNestingCorrectlyRemovesInvalidTags
 *
 * Each data set is [content, expected result].
 *
 * @return array
 */
public static function localNestingCorrectlyRemovesInvalidTagsDataProvider()
{
    $dataSets = [];

    $dataSets['Valid nesting is untouched'] = [
        '<B><I></B></I>',
        '<B><I></B></I>',
    ];
    $dataSets['Valid nesting with content is untouched'] = [
        'testa<B>test1<I>test2</B>test3</I>testb',
        'testa<B>test1<I>test2</B>test3</I>testb',
    ];
    $dataSets['Superflous tags are removed'] = [
        '</B><B><I></B></I></B>',
        '<B><I></B></I>',
    ];
    $dataSets['Superflous tags with content are removed'] = [
        'test1</B>test2<B>test3<I>test4</B>test5</I>test6</B>test7',
        'test1test2<B>test3<I>test4</B>test5</I>test6test7',
    ];
    $dataSets['Another valid nesting test'] = [
        '<span><div></span></div>',
        '<span><div></span></div>',
    ];

    return $dataSets;
}
229
/**
 * Cleans the content with allowTags and localNesting both set to
 * 'div,span,b,i' and compares the output with the expected result.
 *
 * @test
 * @dataProvider localNestingCorrectlyRemovesInvalidTagsDataProvider
 * @param string $content
 * @param string $expectedResult
 */
public function localNestingCorrectlyRemovesInvalidTags($content, $expectedResult)
{
    $tsConfig = [
        'allowTags' => 'div,span,b,i',
        'localNesting' => 'div,span,b,i',
    ];

    $cleaned = $this->parseConfigAndCleanHtml($tsConfig, $content);

    $this->assertEquals($expectedResult, $cleaned);
}
244
/**
 * Data provider for globalNestingCorrectlyRemovesInvalidTags
 *
 * Each data set is [content, expected result].
 *
 * @return array
 */
public static function globalNestingCorrectlyRemovesInvalidTagsDataProvider()
{
    $dataSets = [];

    $dataSets['Valid nesting is untouched'] = [
        '<B><I></I></B>',
        '<B><I></I></B>',
    ];
    $dataSets['Valid nesting with content is untouched'] = [
        'testa<B>test1<I>test2</I>test3</B>testb',
        'testa<B>test1<I>test2</I>test3</B>testb',
    ];
    $dataSets['Invalid nesting is cleaned'] = [
        '</B><B><I></B></I></B>',
        '<B></B>',
    ];
    $dataSets['Invalid nesting with content is cleaned'] = [
        'test1</B>test2<B>test3<I>test4</B>test5</I>test6</B>test7',
        'test1test2<B>test3test4</B>test5test6test7',
    ];
    $dataSets['Another invalid nesting test'] = [
        '<span><div></span></div>',
        '<span></span>',
    ];

    return $dataSets;
}
275
/**
 * Cleans the content with allowTags and globalNesting both set to
 * 'span,div,b,i' and compares the output with the expected result.
 *
 * @test
 * @dataProvider globalNestingCorrectlyRemovesInvalidTagsDataProvider
 * @param string $content
 * @param string $expectedResult
 */
public function globalNestingCorrectlyRemovesInvalidTags($content, $expectedResult)
{
    $tsConfig = [
        'allowTags' => 'span,div,b,i',
        'globalNesting' => 'span,div,b,i',
    ];

    $cleaned = $this->parseConfigAndCleanHtml($tsConfig, $content);

    $this->assertEquals($expectedResult, $cleaned);
}
290
/**
 * Data provider for stripEmptyTags
 *
 * Each data set is
 * [stripOn, tagList, treatNonBreakingSpaceAsEmpty, content, expected result].
 * The same rows are produced once with a NULL tag list and once with an
 * empty-string tag list; the two rows with an explicit tag list are only
 * part of the first pass.
 *
 * @return array
 */
public function emptyTagsDataProvider()
{
    $dataSets = [];

    foreach ([null, ''] as $tagList) {
        $dataSets[] = [0, $tagList, false, '<h1></h1>', '<h1></h1>'];
        $dataSets[] = [1, $tagList, false, '<h1></h1>', ''];
        $dataSets[] = [1, $tagList, false, '<h1>hallo</h1>', '<h1>hallo</h1>'];
        $dataSets[] = [1, $tagList, false, '<h1 class="something"></h1>', ''];
        $dataSets[] = [1, $tagList, false, '<h1 class="something"></h1><h2></h2>', ''];

        if ($tagList === null) {
            // Rows with an explicit tag list keep their original position in the first pass.
            $dataSets[] = [1, 'h2', false, '<h1 class="something"></h1><h2></h2>', '<h1 class="something"></h1>'];
            $dataSets[] = [1, 'h2, h1', false, '<h1 class="something"></h1><h2></h2>', ''];
        }

        $dataSets[] = [1, $tagList, false, '<div><p></p></div>', ''];
        $dataSets[] = [1, $tagList, false, '<div><p>&nbsp;</p></div>', '<div><p>&nbsp;</p></div>'];
        $dataSets[] = [1, $tagList, true, '<div><p>&nbsp;&nbsp;</p></div>', ''];
        $dataSets[] = [1, $tagList, true, '<div>&nbsp;&nbsp;<p></p></div>', ''];
        $dataSets[] = [1, $tagList, false, '<div>Some content<p></p></div>', '<div>Some content</div>'];
        $dataSets[] = [1, $tagList, true, '<div>Some content<p></p></div>', '<div>Some content</div>'];
        $dataSets[] = [1, $tagList, false, '<div>Some content</div>', '<div>Some content</div>'];
        $dataSets[] = [1, $tagList, true, '<div>Some content</div>', '<div>Some content</div>'];
        $dataSets[] = [1, $tagList, false, '<a href="#skiplinks">Skiplinks </a><b></b>', '<a href="#skiplinks">Skiplinks </a>'];
        $dataSets[] = [1, $tagList, true, '<a href="#skiplinks">Skiplinks </a><b></b>', '<a href="#skiplinks">Skiplinks </a>'];
    }

    return $dataSets;
}
331
/**
 * Builds a stripEmptyTags configuration from the data set, cleans the
 * content with it and compares the output with the expected result.
 *
 * @test
 * @dataProvider emptyTagsDataProvider
 * @param bool $stripOn TRUE if stripping should be activated.
 * @param string $tagList Comma separated list of tags that should be stripped.
 * @param bool $treatNonBreakingSpaceAsEmpty If TRUE &nbsp; will be considered empty.
 * @param string $content The HTML code that should be modified.
 * @param string $expectedResult The expected HTML code result.
 */
public function stripEmptyTags($stripOn, $tagList, $treatNonBreakingSpaceAsEmpty, $content, $expectedResult)
{
    $stripOptions = [
        'tags' => $tagList,
        'treatNonBreakingSpaceAsEmpty' => $treatNonBreakingSpaceAsEmpty,
    ];
    $tsConfig = [
        'keepNonMatchedTags' => 1,
        'stripEmptyTags' => $stripOn,
        'stripEmptyTags.' => $stripOptions,
    ];

    $this->assertEquals(
        $expectedResult,
        $this->parseConfigAndCleanHtml($tsConfig, $content)
    );
}
355
/**
 * Data provider for stripEmptyTagsKeepsConfiguredTags
 *
 * Each data set is
 * [keepTags list, treatNonBreakingSpaceAsEmpty, content, expected result].
 *
 * @return array
 */
public function stripEmptyTagsKeepsConfiguredTagsDataProvider()
{
    $keepTags = 'tr,td';
    // Both data sets expect the same output.
    $expectedResult = '<div><p><tr><td></td></tr></p></div><tr></tr><td></td>';

    $withoutNonBreakingSpaceHandling = [
        $keepTags,
        false,
        '<div><p><tr><td></td></tr></p></div><div class="test"></div><tr></tr><p></p><td></td><i></i>',
        $expectedResult,
    ];
    $withNonBreakingSpaceHandling = [
        $keepTags,
        true,
        '<div><p><tr><td></td></tr></p></div><p class="test"> &nbsp; </p><tr></tr><p></p><td></td><i></i>',
        $expectedResult,
    ];

    return [
        $withoutNonBreakingSpaceHandling,
        $withNonBreakingSpaceHandling,
    ];
}
375
/**
 * Cleans the content with stripEmptyTags enabled and a keepTags list,
 * then compares the output with the expected result.
 *
 * @test
 * @dataProvider stripEmptyTagsKeepsConfiguredTagsDataProvider
 * @param string $tagList List of tags that should be kept, even if they are empty.
 * @param bool $treatNonBreakingSpaceAsEmpty If true &nbsp; will be considered empty.
 * @param string $content The HTML content that should be parsed.
 * @param string $expectedResult The expected HTML code result.
 */
public function stripEmptyTagsKeepsConfiguredTags($tagList, $treatNonBreakingSpaceAsEmpty, $content, $expectedResult)
{
    $stripOptions = [
        'keepTags' => $tagList,
        'treatNonBreakingSpaceAsEmpty' => $treatNonBreakingSpaceAsEmpty,
    ];
    $tsConfig = [
        'keepNonMatchedTags' => 1,
        'stripEmptyTags' => 1,
        'stripEmptyTags.' => $stripOptions,
    ];

    $this->assertEquals(
        $expectedResult,
        $this->parseConfigAndCleanHtml($tsConfig, $content)
    );
}
397
/**
 * Passes the TypoScript configuration to HTMLparserConfig() and hands the
 * first four entries of its result, in order, to HTMLcleaner() together
 * with the content.
 *
 * @param array $tsConfig The TypoScript that should be used to generate the HTML parser config.
 * @param string $content The content that should be parsed by the HTMLcleaner.
 * @return string The parsed content.
 */
protected function parseConfigAndCleanHtml(array $tsConfig, $content)
{
    $parserArguments = $this->subject->HTMLparserConfig($tsConfig);

    $cleanedContent = $this->subject->HTMLcleaner(
        $content,
        $parserArguments[0],
        $parserArguments[1],
        $parserArguments[2],
        $parserArguments[3]
    );

    return $cleanedContent;
}
410
/**
 * Data provider for getFirstTag
 *
 * Each data set is [input string, expected result].
 *
 * @return array
 */
public function getFirstTagDataProvider()
{
    $dataSets = [];

    $dataSets[] = ['<body><span></span></body>', '<body>'];
    $dataSets[] = ['<span>Wrapper<div>Some content</div></span>', '<span>'];
    // Text in front of the first tag is part of the expected result.
    $dataSets[] = ['Something before<span>Wrapper<div>Some content</div></span>Something after', 'Something before<span>'];
    $dataSets[] = ['Something without tag', ''];

    return $dataSets;
}
425
/**
 * Compares the return value of getFirstTag() on the subject with the
 * expected string.
 *
 * @test
 * @dataProvider getFirstTagDataProvider
 *
 * @param string $str HTML string with tags
 * @param string $expected The expected result.
 */
public function getFirstTag($str, $expected)
{
    $firstTag = $this->subject->getFirstTag($str);

    $this->assertEquals($expected, $firstTag);
}
440
/**
 * Data provider for getFirstTagName
 *
 * Each data set is [input string, preserveCase flag, expected tag name].
 *
 * @return array
 */
public function getFirstTagNameDataProvider()
{
    return [
        ['<body><span></span></body>', false, 'BODY'],
        ['<body><span></span></body>', true, 'body'],
        ['<div class="test"><span></span></div>', false, 'DIV'],
        ['<div><span class="test"></span></div>', false, 'DIV'],
        ['<br /><span class="test"></span>', false, 'BR'],
        ['<img src="test.jpg" />', false, 'IMG'],
    ];
}
469
/**
 * Compares the return value of getFirstTagName() on the subject with the
 * expected tag name.
 *
 * @test
 * @dataProvider getFirstTagNameDataProvider
 *
 * @param string $str HTML tag (The element name MUST be separated from the attributes by a space character! Just *whitespace* will not do)
 * @param bool $preserveCase If set, the case of the tag name is preserved instead of being converted to uppercase.
 * @param string $expected The expected result.
 */
public function getFirstTagName($str, $preserveCase, $expected)
{
    $tagName = $this->subject->getFirstTagName($str, $preserveCase);

    $this->assertEquals($expected, $tagName);
}
484
/**
 * Data provider for removeFirstAndLastTag
 *
 * Each data set is [input string, expected result].
 *
 * @return array
 */
public function removeFirstAndLastTagDataProvider()
{
    $dataSets = [];

    $dataSets[] = ['<span>Wrapper<div>Some content</div></span>', 'Wrapper<div>Some content</div>'];
    $dataSets[] = ['<td><tr>Some content</tr></td>', '<tr>Some content</tr>'];
    $dataSets[] = ['Something before<span>Wrapper<div>Some content</div></span>Something after', 'Wrapper<div>Some content</div>'];
    $dataSets[] = ['<span class="hidden">Wrapper<div>Some content</div></span>', 'Wrapper<div>Some content</div>'];
    $dataSets[] = ['<span>Wrapper<div class="hidden">Some content</div></span>', 'Wrapper<div class="hidden">Some content</div>'];
    $dataSets[] = ['Some stuff before <span>Wrapper<div class="hidden">Some content</div></span> and after', 'Wrapper<div class="hidden">Some content</div>'];

    return $dataSets;
}
499
/**
 * Compares the return value of removeFirstAndLastTag() on the subject
 * with the expected string.
 *
 * @test
 * @dataProvider removeFirstAndLastTagDataProvider
 * @param string $str String to process
 * @param string $expectedResult
 */
public function removeFirstAndLastTag($str, $expectedResult)
{
    $stripped = $this->subject->removeFirstAndLastTag($str);

    $this->assertEquals($expectedResult, $stripped);
}
513
/**
 * Data provider for getTagAttributes
 *
 * Each data set is [tag, [attribute values, attribute meta information]].
 * In both arrays the attribute names are lowercase keys; the meta
 * information holds the name as written in the tag ('origTag') and, where
 * present, the quote character ('dashType').
 *
 * @return array
 */
public function getTagAttributesDataProvider()
{
    $anchorValues = [
        'href' => '',
        'data-shortcut' => 'DXB',
        'required' => '',
    ];
    $anchorMeta = [
        'href' => ['origTag' => 'href', 'dashType' => '"'],
        'data-shortcut' => ['origTag' => 'data-shortCut', 'dashType' => '"'],
        'required' => ['origTag' => 'required'],
    ];

    $listValues = [
        'style' => 'background-image: (url: "fra.png")',
        'data-shortcut' => 'FRA',
    ];
    $listMeta = [
        'style' => ['origTag' => 'STYLE', 'dashType' => '\''],
        'data-shortcut' => ['origTag' => 'data-shortcut', 'dashType' => ''],
    ];

    return [
        [
            '<a href="" data-shortCut="DXB" required>',
            [$anchorValues, $anchorMeta],
        ],
        [
            '<ul STYLE=\'background-image: (url: "fra.png")\' data-shortcut=FRA>',
            [$listValues, $listMeta],
        ],
    ];
}
537
/**
 * Compares the return value of get_tag_attributes() on the subject with
 * the expected attribute and meta information arrays.
 *
 * @test
 * @dataProvider getTagAttributesDataProvider
 * @param string $tag String to process
 * @param array $expectedResult
 */
public function getTagAttributes($tag, $expectedResult)
{
    $attributes = $this->subject->get_tag_attributes($tag);

    $this->assertEquals($expectedResult, $attributes);
}
551
/**
 * Data provider for rawStripEmptyTagsTest
 *
 * Each data set is
 * [content, tagList, treatNonBreakingSpaceAsEmpty, expected result].
 *
 * @return array
 */
public function stripEmptyTagsDataProvider()
{
    $dataSets = [];

    // Wrongly encapsulated tags and mixed upper/lowercase tag names
    $dataSets[] = [
        '<div>Denpassar</div><p> Bali</P><p></p><P></p><ul><li></li></ul>',
        '',
        false,
        '<div>Denpassar</div><p> Bali</P>',
    ];

    // Incomplete tags
    $dataSets[] = [
        '<p><div>Klungklung</div></p><p> Semarapura<p></p><p></p><ul><li></li></ul>',
        '',
        false,
        '<p><div>Klungklung</div></p><p> Semarapura',
    ];

    // Non-breaking spaces are treated as empty
    $dataSets[] = [
        '<p><div>Badung</div></p><ul> Mangupura<p></p><p></p><ul><li>&nbsp;</li><li>Uluwatu</li></ul>',
        '',
        true,
        '<p><div>Badung</div></p><ul> Mangupura<ul><li>Uluwatu</li></ul>',
    ];

    // Tag list is set: empty tags that are not listed stay, listed tags with content stay
    $dataSets[] = [
        '<p><div>Badung</div></p><ul> Mangupura<p></p><p></p><ul><li></li></ul>',
        'p,div',
        true,
        '<p><div>Badung</div></p><ul> Mangupura<ul><li></li></ul>',
    ];

    return $dataSets;
}
589
/**
 * Calls stripEmptyTags() on the subject directly and compares the output
 * with the expected result.
 *
 * @test
 * @dataProvider stripEmptyTagsDataProvider
 * @param string $content The content to be stripped of empty tags
 * @param string $tagList The comma separated list of tags to be stripped.
 *                        If empty, all empty tags will be stripped
 * @param bool $treatNonBreakingSpaceAsEmpty If TRUE tags containing only &nbsp; entities will be treated as empty.
 * @param string $expectedResult
 */
public function rawStripEmptyTagsTest($content, $tagList, $treatNonBreakingSpaceAsEmpty, $expectedResult)
{
    $strippedContent = $this->subject->stripEmptyTags($content, $tagList, $treatNonBreakingSpaceAsEmpty);

    $this->assertEquals($expectedResult, $strippedContent);
}
605 }