[FEATURE] Allow stripping of empty tags in HtmlParser 75/16975/19
authorAlexander Stehlik <alexander.stehlik@gmail.com>
Mon, 2 Mar 2015 19:07:34 +0000 (20:07 +0100)
committerAnja Leichsenring <aleichsenring@ab-softlab.de>
Mon, 2 Mar 2015 19:15:39 +0000 (20:15 +0100)
Add a stripEmptyTags method to the HtmlParser
It can be enabled by TypoScript or TSConfig:

HTMLparser.stripEmptyTags = 1
HTMLparser.stripEmptyTags.tags = h2, h3
HTMLparser.stripEmptyTags.treatNonBreakingSpaceAsEmpty = 1

Resolves: #20555
Releases: master
Change-Id: I640486e9f32da6ac1eba05e3c38d15a0aba41055
Reviewed-on: http://review.typo3.org/16975
Reviewed-by: Frank Nägler <typo3@naegler.net>
Tested-by: Frank Nägler <typo3@naegler.net>
Reviewed-by: Anja Leichsenring <aleichsenring@ab-softlab.de>
Tested-by: Anja Leichsenring <aleichsenring@ab-softlab.de>
typo3/sysext/core/Classes/Html/HtmlParser.php
typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst [new file with mode: 0644]
typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php

index 7bcc385..150a9b0 100644 (file)
@@ -982,7 +982,9 @@ class HtmlParser {
                                unset($newContent[$pKey]);
                        }
                }
-               return implode('', $newContent);
+               $newContent = implode('', $newContent);
+               $newContent = $this->stripEmptyTagsIfConfigured($newContent, $addConfig);
+               return $newContent;
        }
 
        /**
@@ -1410,6 +1412,12 @@ class HtmlParser {
                if ($TSconfig['xhtml_cleaning']) {
                        $addConfig['xhtml'] = 1;
                }
+               if (isset($TSconfig['stripEmptyTags'])) {
+                       $addConfig['stripEmptyTags'] = $TSconfig['stripEmptyTags'];
+                       if (isset($TSconfig['stripEmptyTags.'])) {
+                               $addConfig['stripEmptyTags.'] = $TSconfig['stripEmptyTags.'];
+                       }
+               }
                return array(
                        $keepTags,
                        '' . $TSconfig['keepNonMatchedTags'],
@@ -1523,4 +1531,60 @@ class HtmlParser {
                return $value;
        }
 
+       /**
+        * Strips empty tags from HTML.
+        *
+        * A tag is considered empty when nothing but space characters (and, if
+        * requested, &nbsp; entities) stands between its opening and its closing tag.
+        * The replacement is repeated until nothing changes any more, so elements that
+        * only become empty after their children were removed are stripped as well.
+        *
+        * @param string $content The content to be stripped of empty tags
+        * @param string $tagList The comma separated list of tags to be stripped.
+        *                        If empty, all empty tags will be stripped
+        * @param bool $treatNonBreakingSpaceAsEmpty If TRUE tags containing only &nbsp; entities will be treated as empty.
+        * @return string the stripped content
+        */
+       public function stripEmptyTags($content, $tagList = NULL, $treatNonBreakingSpaceAsEmpty = FALSE) {
+               // Default: any tag name, i.e. all characters until a whitespace or > is reached.
+               $tagRegEx = '[^\\s>]+';
+               if ($tagList) {
+                       $tags = array();
+                       foreach (explode(',', $tagList) as $tag) {
+                               $tag = trim($tag);
+                               // Blank entries ("h2,,h3" or a trailing comma) are skipped: an empty
+                               // alternative in the pattern would match every tag name.
+                               if ($tag !== '') {
+                                       $tags[] = preg_quote($tag, '/');
+                               }
+                       }
+                       // A list without any usable entry is treated like an empty list.
+                       if (!empty($tags)) {
+                               $tagRegEx = implode('|', $tags);
+                       }
+               }
+               $emptyContentRegEx = $treatNonBreakingSpaceAsEmpty ? '(?: |&nbsp;)*' : ' *';
+               // The lookahead forces the captured group to be the complete tag name. Without it
+               // "p" would also match "<pre>" and "<h1></h2>" would be accepted as a pair.
+               $pattern = '/<(' . $tagRegEx . ')(?=[\\s>])[^>]*>' . $emptyContentRegEx . '<\\/\\1\\s*>/i';
+               do {
+                       $strippedContent = preg_replace($pattern, '', $content, -1, $count);
+                       if ($strippedContent === NULL) {
+                               // preg_replace() failed (PCRE error): keep the last valid content
+                               // instead of returning NULL to the caller.
+                               break;
+                       }
+                       $content = $strippedContent;
+               } while ($count > 0);
+               return $content;
+       }
+
+       /**
+        * Strips empty tags from the HTML code if the configuration asks for it.
+        *
+        * The key "stripEmptyTags" acts as the switch. The optional sub keys "tags" and
+        * "treatNonBreakingSpaceAsEmpty" below "stripEmptyTags." are handed over to
+        * stripEmptyTags() as tag list and &nbsp; handling flag.
+        *
+        * @param string $value The HTML code to process
+        * @param array $configuration The parser configuration
+        * @return string The processed HTML code, or $value unchanged if stripping is not enabled
+        */
+       protected function stripEmptyTagsIfConfigured($value, $configuration) {
+               // Nothing to do unless the feature is switched on.
+               if (empty($configuration['stripEmptyTags'])) {
+                       return $value;
+               }
+
+               // Only a non-empty tag list restricts the stripping; otherwise NULL means "all tags".
+               $hasTagList = isset($configuration['stripEmptyTags.']['tags'])
+                       && $configuration['stripEmptyTags.']['tags'] !== '';
+               $tagList = $hasTagList ? $configuration['stripEmptyTags.']['tags'] : NULL;
+
+               $nbspCountsAsEmpty = !empty($configuration['stripEmptyTags.']['treatNonBreakingSpaceAsEmpty']);
+
+               return $this->stripEmptyTags($value, $tagList, $nbspCountsAsEmpty);
+       }
 }
diff --git a/typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst b/typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst
new file mode 100644 (file)
index 0000000..5c6e151
--- /dev/null
@@ -0,0 +1,60 @@
+=====================================================
+Feature: #20555 - Strip empty HTML tags in HtmlParser
+=====================================================
+
+Description
+===========
+
+New functionality is introduced in the HtmlParser that allows stripping empty HTML tags.
+
+It can be used in the Frontend by using the :ref:`HTMLparser<t3tsref:htmlparser>` TypoScript
+configuration of :ref:`stdWrap<t3tsref:stdwrap-htmlparser>`:
+
+.. code-block:: typoscript
+
+       stdWrap {
+
+               // If this is set all empty tags are stripped, unless a list of tags is provided below.
+               HTMLparser.stripEmptyTags = 1
+
+               // This setting can be used to filter the tags that should be stripped if they are empty.
+               HTMLparser.stripEmptyTags.tags = h2, h3
+       }
+
+It is also possible to use it in the
+:ref:`HTMLparser_rte or HTMLparser_db<transformations-tsconfig-processing-htmlparser>`
+in Page TSconfig:
+
+.. code-block:: typoscript
+
+       // For rtehtmlarea we need to use the entry parser because otherwise the p tags will
+       // be converted to linebreaks during the RTE transformation.
+       RTE.default.proc.entryHTMLparser_db {
+               stripEmptyTags = 1
+               stripEmptyTags.tags = p
+
+               // Since rtehtmlarea adds non breaking spaces in empty <p> tags we need to
+               // tell the parser that &nbsp; should be treated as an empty string:
+               stripEmptyTags.treatNonBreakingSpaceAsEmpty = 1
+       }
+
+**Hint!** Please note that the HTMLparser will strip all unknown tags by default. If you **only** want
+to strip empty tags, you need to set ``keepNonMatchedTags`` to TRUE or configure the allowed tags:
+
+.. code-block:: typoscript
+
+       stdWrap {
+               HTMLparser.keepNonMatchedTags = 1
+               HTMLparser.stripEmptyTags = 1
+               HTMLparser.stripEmptyTags.tags = h2, h3
+       }
+
+
+Impact
+======
+
+If the configuration is not set, the HtmlParser behaves as before, so there is no
+impact on existing systems (unless they already used a setting named stripEmptyTags
+for some other purpose).
+
+
index 70249b4..49fba8f 100644 (file)
@@ -267,4 +267,51 @@ Value 2.2
                $this->assertSame($expected, $result);
        }
 
+       /**
+        * Provides the data sets for the stripEmptyTags test.
+        *
+        * Every data set contains, in this order: the stripEmptyTags switch, the tag
+        * list, the treatNonBreakingSpaceAsEmpty flag, the HTML input and the expected
+        * HTML output.
+        *
+        * @return array
+        */
+       public function emptyTagsDataProvider() {
+               $dataSets = array();
+
+               // Stripping switched off / on for simple tags
+               $dataSets[] = array(0, NULL, FALSE, '<h1></h1>', '<h1></h1>');
+               $dataSets[] = array(1, NULL, FALSE, '<h1></h1>', '');
+               $dataSets[] = array(1, NULL, FALSE, '<h1>hallo</h1>', '<h1>hallo</h1>');
+               $dataSets[] = array(1, NULL, FALSE, '<h1 class="something"></h1>', '');
+               $dataSets[] = array(1, NULL, FALSE, '<h1 class="something"></h1><h2></h2>', '');
+
+               // Restricting the stripping to a list of tags
+               $dataSets[] = array(1, 'h2', FALSE, '<h1 class="something"></h1><h2></h2>', '<h1 class="something"></h1>');
+               $dataSets[] = array(1, 'h2, h1', FALSE, '<h1 class="something"></h1><h2></h2>', '');
+
+               // Nested tags and non breaking spaces
+               $dataSets[] = array(1, NULL, FALSE, '<div><p></p></div>', '');
+               $dataSets[] = array(1, NULL, FALSE, '<div><p>&nbsp;</p></div>', '<div><p>&nbsp;</p></div>');
+               $dataSets[] = array(1, NULL, TRUE, '<div><p>&nbsp;&nbsp;</p></div>', '');
+               $dataSets[] = array(1, NULL, TRUE, '<div>&nbsp;&nbsp;<p></p></div>', '');
+
+               // Tags with real content have to stay
+               $dataSets[] = array(1, NULL, FALSE, '<div>Some content<p></p></div>', '<div>Some content</div>');
+               $dataSets[] = array(1, NULL, TRUE, '<div>Some content<p></p></div>', '<div>Some content</div>');
+               $dataSets[] = array(1, NULL, FALSE, '<div>Some content</div>', '<div>Some content</div>');
+               $dataSets[] = array(1, NULL, TRUE, '<div>Some content</div>', '<div>Some content</div>');
+               $dataSets[] = array(1, NULL, FALSE, '<a href="#skiplinks">Skiplinks </a><b></b>', '<a href="#skiplinks">Skiplinks </a>');
+               $dataSets[] = array(1, NULL, TRUE, '<a href="#skiplinks">Skiplinks </a><b></b>', '<a href="#skiplinks">Skiplinks </a>');
+
+               return $dataSets;
+       }
+
+       /**
+        * Runs the HTML through HTMLparserConfig() and HTMLcleaner() with the given
+        * stripEmptyTags settings and checks the cleaned result.
+        *
+        * @test
+        * @dataProvider emptyTagsDataProvider
+        * @param int $stripOn 1 if stripping should be activated, 0 otherwise.
+        * @param string $tagList Comma separated list of tags that should be stripped.
+        * @param bool $treatNonBreakingSpaceAsEmpty If TRUE &nbsp; will be considered empty.
+        * @param string $content The HTML code that should be modified.
+        * @param string $expectedResult The expected HTML code result.
+        */
+       public function stripEmptyTags($stripOn, $tagList, $treatNonBreakingSpaceAsEmpty, $content, $expectedResult) {
+               $tsConfig = array(
+                       // Keep unknown tags, otherwise the parser would remove them regardless of stripEmptyTags.
+                       'keepNonMatchedTags' => 1,
+                       'stripEmptyTags' => $stripOn,
+                       'stripEmptyTags.' => array(
+                               'tags' => $tagList,
+                               'treatNonBreakingSpaceAsEmpty' => $treatNonBreakingSpaceAsEmpty
+                       ),
+               );
+               $config = $this->subject->HTMLparserConfig($tsConfig);
+               $result = $this->subject->HTMLcleaner($content, $config[0], $config[1], $config[2], $config[3]);
+               // Strict comparison: with assertEquals() a NULL result would be accepted as an empty string.
+               $this->assertSame($expectedResult, $result);
+       }
 }