[!!!][TASK] Deprecate useCacheHash/noCacheHash
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Html / RteHtmlParser.php
1 <?php
2 namespace TYPO3\CMS\Core\Html;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use Psr\Log\LoggerAwareInterface;
18 use Psr\Log\LoggerAwareTrait;
19 use TYPO3\CMS\Backend\Utility\BackendUtility;
20 use TYPO3\CMS\Core\LinkHandling\Exception\UnknownLinkHandlerException;
21 use TYPO3\CMS\Core\LinkHandling\LinkService;
22 use TYPO3\CMS\Core\Resource;
23 use TYPO3\CMS\Core\Type\File\ImageInfo;
24 use TYPO3\CMS\Core\Utility\GeneralUtility;
25
26 /**
27 * Class for parsing HTML for the Rich Text Editor. (also called transformations)
28 *
29 * Concerning line breaks:
30 * Regardless if LF (Unix-style) or CRLF (Windows) was put in, the HtmlParser works with LFs and migrates all
31 * line breaks to LFs internally, however when all transformations are done, all LFs are transformed to CRLFs.
32 * This means: RteHtmlParser always returns CRLFs to be maximum compatible with all formats.
33 */
34 class RteHtmlParser extends HtmlParser implements LoggerAwareInterface
35 {
36 use LoggerAwareTrait;
37
38 /**
39 * List of elements that are not wrapped into a "p" tag while doing the transformation.
40 * @var string
41 */
42 protected $blockElementList = 'DIV,TABLE,BLOCKQUOTE,PRE,UL,OL,H1,H2,H3,H4,H5,H6,ADDRESS,DL,DD,HEADER,SECTION,FOOTER,NAV,ARTICLE,ASIDE';
43
44 /**
45 * List of all tags that are allowed by default
46 * @var string
47 */
48 protected $defaultAllowedTagsList = 'b,i,u,a,img,br,div,center,pre,font,hr,sub,sup,p,strong,em,li,ul,ol,blockquote,strike,span,abbr,acronym,dfn';
49
50 /**
51 * Set this to the pid of the record manipulated by the class.
52 *
53 * @var int
54 */
55 protected $recPid = 0;
56
57 /**
58 * Element reference [table]:[field], eg. "tt_content:bodytext"
59 *
60 * @var string
61 */
62 protected $elRef = '';
63
64 /**
65 * Current Page TSconfig
66 *
67 * @var array
68 */
69 protected $tsConfig = [];
70
71 /**
72 * Set to the TSconfig options coming from Page TSconfig
73 *
74 * @var array
75 */
76 protected $procOptions = [];
77
78 /**
79 * Run-away brake for recursive calls.
80 *
81 * @var int
82 */
83 protected $TS_transform_db_safecounter = 100;
84
85 /**
86 * Data caching for processing function
87 *
88 * @var array
89 */
90 protected $getKeepTags_cache = [];
91
92 /**
93 * Storage of the allowed CSS class names in the RTE
94 *
95 * @var array
96 */
97 protected $allowedClasses = [];
98
99 /**
100 * A list of HTML attributes for <p> tags. Because <p> tags are wrapped currently in a special handling,
101 * they have a special place for configuration via 'proc.keepPDIVattribs'
102 *
103 * @var array
104 */
105 protected $allowedAttributesForParagraphTags = [
106 'class',
107 'align',
108 'id',
109 'title',
110 'dir',
111 'lang',
112 'xml:lang',
113 'itemscope',
114 'itemtype',
115 'itemprop'
116 ];
117
118 /**
119 * Any tags that are allowed outside of <p> sections - usually similar to the block elements
120 * plus some special tags like <hr> and <img> (if images are allowed).
121 * Completely overrideable via 'proc.allowTagsOutside'
122 *
123 * @var array
124 */
125 protected $allowedTagsOutsideOfParagraphs = [
126 'address',
127 'article',
128 'aside',
129 'blockquote',
130 'div',
131 'footer',
132 'header',
133 'hr',
134 'nav',
135 'section'
136 ];
137
138 /**
139 * Initialize, setting element reference and record PID
140 *
 * @param string $elRef Element reference, eg. "tt_content:bodytext"
142 * @param int $recPid PID of the record (page id)
143 */
public function init($elRef = '', $recPid = 0)
{
    // Remember the "[table]:[field]" reference of the element being processed ...
    $this->elRef = $elRef;
    // ... and the pid of the record being manipulated.
    $this->recPid = $recPid;
}
149
150 /**********************************************
151 *
152 * Main function
153 *
154 **********************************************/
155 /**
156 * Transform value for RTE based on specConf in the direction specified by $direction (rte/db)
157 * This is the main function called from DataHandler and transfer data classes
158 *
159 * @param string $value Input value
160 * @param null $_ unused
161 * @param string $direction Direction of the transformation. Two keywords are allowed; "db" or "rte". If "db" it means the transformation will clean up content coming from the Rich Text Editor and goes into the database. The other direction, "rte", is of course when content is coming from database and must be transformed to fit the RTE.
162 * @param array $thisConfig Parsed TypoScript content configuring the RTE, probably coming from Page TSconfig.
163 * @return string Output value
164 */
public function RTE_transform($value, $_ = null, $direction = 'rte', $thisConfig = [])
{
    $this->tsConfig = $thisConfig;
    // "proc." is optional: fall back to an empty option set instead of reading an undefined index
    $this->procOptions = (array)($thisConfig['proc.'] ?? []);
    // The keep-tags cache is derived from procOptions, so it must not survive a configuration change
    $this->getKeepTags_cache = [];

    if (isset($this->procOptions['allowedClasses.'])) {
        $this->allowedClasses = (array)$this->procOptions['allowedClasses.'];
    } else {
        $this->allowedClasses = GeneralUtility::trimExplode(',', $this->procOptions['allowedClasses'] ?? '', true);
    }

    // Dynamic configuration of blockElementList
    if (!empty($this->procOptions['blockElementList'])) {
        $this->blockElementList = $this->procOptions['blockElementList'];
    }

    // Define which attributes are allowed on <p> tags
    if (isset($this->procOptions['allowAttributes.'])) {
        $this->allowedAttributesForParagraphTags = $this->procOptions['allowAttributes.'];
    }
    // Override tags which are allowed outside of <p> tags
    if (isset($this->procOptions['allowTagsOutside'])) {
        if (!isset($this->procOptions['allowTagsOutside.'])) {
            $this->allowedTagsOutsideOfParagraphs = GeneralUtility::trimExplode(',', strtolower($this->procOptions['allowTagsOutside']), true);
        } else {
            $this->allowedTagsOutsideOfParagraphs = (array)$this->procOptions['allowTagsOutside.'];
        }
    }

    // Setting modes / transformations to be called; "overruleMode" wins over "mode" when it is non-empty
    $overruleMode = (string)($this->procOptions['overruleMode'] ?? '');
    if ($overruleMode !== '') {
        $modes = GeneralUtility::trimExplode(',', $overruleMode);
    } else {
        // An unset mode resolves to an empty mode list further down
        $modes = [$this->procOptions['mode'] ?? ''];
    }
    $modes = $this->resolveAppliedTransformationModes($direction, $modes);

    $value = $this->streamlineLineBreaksForProcessing($value);

    // If an entry HTML cleaner was configured, pass the content through the HTMLcleaner
    $value = $this->runHtmlParserIfConfigured($value, 'entryHTMLparser_' . $direction);

    // Traverse modes
    foreach ($modes as $cmd) {
        // A user defined transformation registered for this mode replaces the built-in handling
        $customTransformationClass = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_parsehtml_proc.php']['transformation'][$cmd] ?? null;
        if ($direction === 'db') {
            if (!empty($customTransformationClass)) {
                $_procObj = GeneralUtility::makeInstance($customTransformationClass);
                $_procObj->pObj = $this;
                $_procObj->transformationKey = $cmd;
                $value = $_procObj->transform_db($value, $this);
            } else {
                // ... else use defaults:
                switch ($cmd) {
                    case 'detectbrokenlinks':
                        $value = $this->removeBrokenLinkMarkers($value);
                        break;
                    case 'ts_images':
                        $value = $this->TS_images_db($value);
                        break;
                    case 'ts_links':
                        $value = $this->TS_links_db($value);
                        break;
                    case 'css_transform':
                        // Transform empty paragraphs into spacing paragraphs
                        $value = str_replace('<p></p>', '<p>&nbsp;</p>', $value);
                        // Double any trailing spacing paragraph so that it does not get removed by divideIntoLines()
                        $value = preg_replace('/<p>&nbsp;<\/p>$/', '<p>&nbsp;</p><p>&nbsp;</p>', $value);
                        $value = $this->TS_transform_db($value);
                        break;
                    default:
                        // Do nothing
                }
            }
        } elseif ($direction === 'rte') {
            if (!empty($customTransformationClass)) {
                $_procObj = GeneralUtility::makeInstance($customTransformationClass);
                // Note: unlike the "db" direction, transformationKey is not handed to the processor here
                $_procObj->pObj = $this;
                $value = $_procObj->transform_rte($value, $this);
            } else {
                // ... else use defaults (there is no built-in "ts_links" handling in this direction):
                switch ($cmd) {
                    case 'detectbrokenlinks':
                        $value = $this->markBrokenLinks($value);
                        break;
                    case 'ts_images':
                        $value = $this->TS_images_rte($value);
                        break;
                    case 'css_transform':
                        $value = $this->TS_transform_rte($value);
                        break;
                    default:
                        // Do nothing
                }
            }
        }
    }

    // If an exit HTML cleaner was configured, pass the content through the HTMLcleaner
    $value = $this->runHtmlParserIfConfigured($value, 'exitHTMLparser_' . $direction);

    // Final clean up of linebreaks
    $value = $this->streamlineLineBreaksAfterProcessing($value);

    return $value;
}
271
272 /**
 * Determines which transformation modes should be executed, and ensures that each is only executed once.
274 *
275 * @param string $direction
276 * @param array $modes
277 * @return array the resolved transformation modes
278 */
protected function resolveAppliedTransformationModes(string $direction, array $modes)
{
    // Modes the shortcut "default" stands for, in "db" processing order
    $defaultModes = ['detectbrokenlinks', 'css_transform', 'ts_images', 'ts_links'];

    // Every entry may itself be a comma separated list, so flatten before inspecting single modes
    $requestedModes = GeneralUtility::trimExplode(',', implode(',', $modes), true);

    $resolvedModes = [];
    foreach ($requestedModes as $mode) {
        // Only an exact "default" entry is expanded; custom mode names that merely
        // contain the word "default" are passed through untouched
        if ($mode === 'default') {
            $resolvedModes = array_merge($resolvedModes, $defaultModes);
        } else {
            $resolvedModes[] = $mode;
        }
    }

    // Make list unique so that no transformation runs twice
    $resolvedModes = array_unique($resolvedModes);
    // Reverse order if direction is "rte"
    if ($direction === 'rte') {
        $resolvedModes = array_reverse($resolvedModes);
    }

    return $resolvedModes;
}
295
296 /**
297 * Runs the HTML parser if it is configured
298 * Getting additional HTML cleaner configuration. These are applied either before or after the main transformation
299 * is done and thus totally independent processing options you can set up.
300 *
301 * This is only possible via TSconfig (procOptions) currently.
302 *
303 * @param string $content
 * @param string $configurationDirective used to look up in the procOptions if enabled, and then fetch the parser configuration from the same key suffixed with a dot
305 * @return string the processed content
306 */
protected function runHtmlParserIfConfigured($content, $configurationDirective)
{
    // The directive itself is the on/off switch
    if (empty($this->procOptions[$configurationDirective])) {
        return $content;
    }
    // The parser setup lives under the same key with a trailing dot; it may be missing
    // when only the switch was set, so default to an empty configuration
    $parserConfiguration = $this->procOptions[$configurationDirective . '.'] ?? [];
    list($keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration) = $this->HTMLparserConfig($parserConfiguration);
    return $this->HTMLcleaner($content, $keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration);
}
315
316 /************************************
317 *
318 * Specific RTE TRANSFORMATION functions
319 *
320 *************************************/
321 /**
322 * Transformation handler: 'ts_images' / direction: "db"
323 * Processing images inserted in the RTE.
324 * This is used when content goes from the RTE to the database.
 * Images inserted in the RTE have an absolute URL applied to the src attribute. This URL is converted to a relative URL
326 * If it turns out that the URL is from another website than the current the image is read from that external URL and moved to the local server.
327 * Also "magic" images are processed here.
328 *
329 * @param string $value The content from RTE going to Database
330 * @return string Processed content
331 */
protected function TS_images_db($value)
{
    // Split content by <img> tags; odd keys of the result hold the tags themselves
    $imgSplit = $this->splitTags('img', $value);
    if (count($imgSplit) <= 1) {
        // No image tag found, nothing to process
        return implode('', $imgSplit);
    }

    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $sitePath = str_replace(GeneralUtility::getIndpEnv('TYPO3_REQUEST_HOST'), '', $siteUrl);
    /** @var Resource\ResourceFactory $resourceFactory */
    $resourceFactory = Resource\ResourceFactory::getInstance();
    /** @var Resource\Service\MagicImageService $magicImageService */
    $magicImageService = GeneralUtility::makeInstance(Resource\Service\MagicImageService::class);
    $magicImageService->setMagicImageMaximumDimensions($this->tsConfig);

    foreach ($imgSplit as $k => $v) {
        if ($k % 2 === 0) {
            // Content between image tags is left untouched
            continue;
        }
        // Get attributes
        list($attribArray) = $this->get_tag_attributes($v, true);
        // It's always an absolute URL coming from the RTE into the Database.
        $absoluteUrl = trim($attribArray['src'] ?? '');
        // Make path absolute if it is relative and we have a site path which is not '/'.
        // The scheme has to be read with parse_url(); pathinfo() never reports one.
        if ($sitePath && !parse_url($absoluteUrl, PHP_URL_SCHEME) && GeneralUtility::isFirstPartOfStr($absoluteUrl, $sitePath)) {
            // If site is in a subpath (eg. /~user_jim/) this path needs to be removed because it will be added with $siteUrl
            $absoluteUrl = $siteUrl . substr($absoluteUrl, strlen($sitePath));
        }
        // Image dimensions set in the img tag, if any
        $imgTagDimensions = $this->getWHFromAttribs($attribArray);
        if ($imgTagDimensions[0]) {
            $attribArray['width'] = $imgTagDimensions[0];
        }
        if ($imgTagDimensions[1]) {
            $attribArray['height'] = $imgTagDimensions[1];
        }
        $originalImageFile = null;
        if (!empty($attribArray['data-htmlarea-file-uid'])) {
            // An original image file uid is available
            try {
                /** @var Resource\File $originalImageFile */
                $originalImageFile = $resourceFactory->getFileObject((int)$attribArray['data-htmlarea-file-uid']);
            } catch (Resource\Exception\FileDoesNotExistException $fileDoesNotExistException) {
                // Log the fact the file could not be retrieved.
                $message = sprintf('Could not find file with uid "%s"', $attribArray['data-htmlarea-file-uid']);
                $this->logger->error($message);
            }
        }
        if ($originalImageFile instanceof Resource\File) {
            // Public url of local file is relative to the site url, absolute otherwise
            $publicUrl = $originalImageFile->getPublicUrl();
            if ($absoluteUrl == $publicUrl || $absoluteUrl == $siteUrl . $publicUrl) {
                // This is a plain image, i.e. reference to the original image
                if (!empty($this->procOptions['plainImageMode'])) {
                    // "plain image mode" is configured
                    // Find the dimensions of the original image
                    $imageInfo = [
                        $originalImageFile->getProperty('width'),
                        $originalImageFile->getProperty('height')
                    ];
                    if (!$imageInfo[0] || !$imageInfo[1]) {
                        // Dimensions are not stored as file properties, read them from the file itself
                        $filePath = $originalImageFile->getForLocalProcessing(false);
                        $imageInfoObject = GeneralUtility::makeInstance(ImageInfo::class, $filePath);
                        $imageInfo = [
                            $imageInfoObject->getWidth(),
                            $imageInfoObject->getHeight()
                        ];
                    }
                    $attribArray = $this->applyPlainImageModeSettings($imageInfo, $attribArray);
                }
            } else {
                // Magic image case: get a processed file with the requested configuration
                $imageConfiguration = [
                    'width' => $imgTagDimensions[0],
                    'height' => $imgTagDimensions[1]
                ];
                $magicImage = $magicImageService->createMagicImage($originalImageFile, $imageConfiguration);
                $attribArray['width'] = $magicImage->getProperty('width');
                $attribArray['height'] = $magicImage->getProperty('height');
                $attribArray['src'] = $magicImage->getPublicUrl();
            }
        } elseif (!GeneralUtility::isFirstPartOfStr($absoluteUrl, $siteUrl) && empty($this->procOptions['dontFetchExtPictures']) && TYPO3_MODE === 'BE') {
            // External image from another URL: in that case, fetch image, unless the feature is disabled or we are not in backend mode
            // NOTE(review): $absoluteUrl comes straight from editor content and is requested server-side;
            // confirm that GeneralUtility::getUrl() restricts the reachable targets as intended.
            $externalFile = GeneralUtility::getUrl($absoluteUrl);
            if ($externalFile) {
                // The URL may lack a path or a file extension, both are treated as "no extension"
                $urlPath = (string)parse_url($absoluteUrl, PHP_URL_PATH);
                $originalExtension = pathinfo($urlPath, PATHINFO_EXTENSION);
                $extension = strtolower($originalExtension);
                if (in_array($extension, ['jpg', 'jpeg', 'gif', 'png'], true)) {
                    $fileName = GeneralUtility::shortMD5($absoluteUrl) . '.' . $originalExtension;
                    // We insert this image into the user default upload folder.
                    // elRef may be empty or lack the field part, so pad the missing pieces with null.
                    list($table, $field) = array_pad(explode(':', $this->elRef), 2, null);
                    /** @var Resource\Folder $folder */
                    $folder = $GLOBALS['BE_USER']->getDefaultUploadFolder($this->recPid, $table, $field);
                    /** @var Resource\File $fileObject */
                    $fileObject = $folder->createFile($fileName)->setContents($externalFile);
                    $imageConfiguration = [
                        'width' => $attribArray['width'] ?? null,
                        'height' => $attribArray['height'] ?? null
                    ];
                    $magicImage = $magicImageService->createMagicImage($fileObject, $imageConfiguration);
                    $attribArray['width'] = $magicImage->getProperty('width');
                    $attribArray['height'] = $magicImage->getProperty('height');
                    $attribArray['data-htmlarea-file-uid'] = $fileObject->getUid();
                    $attribArray['src'] = $magicImage->getPublicUrl();
                }
            }
        } elseif (GeneralUtility::isFirstPartOfStr($absoluteUrl, $siteUrl)) {
            // Finally, check image as local file (siteURL equals the one of the image)
            // Image has no data-htmlarea-file-uid attribute
            // Relative path, rawurldecoded for special characters.
            $path = rawurldecode(substr($absoluteUrl, strlen($siteUrl)));
            // Absolute filepath, locked to relative path of this project
            $filepath = GeneralUtility::getFileAbsFileName($path);
            // Check file existence (in relative directory to this installation!)
            if ($filepath && @is_file($filepath)) {
                // Treat it as a plain image
                if (!empty($this->procOptions['plainImageMode'])) {
                    // If "plain image mode" has been configured
                    // Find the original dimensions of the image
                    $imageInfoObject = GeneralUtility::makeInstance(ImageInfo::class, $filepath);
                    $imageInfo = [
                        $imageInfoObject->getWidth(),
                        $imageInfoObject->getHeight()
                    ];
                    $attribArray = $this->applyPlainImageModeSettings($imageInfo, $attribArray);
                }
                // Let's try to find a file uid for this image
                try {
                    $fileOrFolderObject = $resourceFactory->retrieveFileOrFolderObject($path);
                    if ($fileOrFolderObject instanceof Resource\FileInterface) {
                        $fileIdentifier = $fileOrFolderObject->getIdentifier();
                        /** @var Resource\AbstractFile $fileObject */
                        $fileObject = $fileOrFolderObject->getStorage()->getFile($fileIdentifier);
                        // @todo if the retrieved file is a processed file, get the original file...
                        $attribArray['data-htmlarea-file-uid'] = $fileObject->getUid();
                    }
                } catch (Resource\Exception\ResourceDoesNotExistException $resourceDoesNotExistException) {
                    // Nothing to be done if file/folder not found
                }
            }
        }
        // Remove width and height from style attribute (a missing attribute is treated as an empty one)
        $attribArray['style'] = preg_replace('/(?:^|[^-])(\\s*(?:width|height)\\s*:[^;]*(?:$|;))/si', '', $attribArray['style'] ?? '');
        // Must have alt attribute
        if (!isset($attribArray['alt'])) {
            $attribArray['alt'] = '';
        }
        // Convert absolute to relative url
        if (isset($attribArray['src']) && GeneralUtility::isFirstPartOfStr($attribArray['src'], $siteUrl)) {
            $attribArray['src'] = substr($attribArray['src'], strlen($siteUrl));
        }
        $imgSplit[$k] = '<img ' . GeneralUtility::implodeAttributes($attribArray, true, true) . ' />';
    }
    return implode('', $imgSplit);
}
488
489 /**
490 * Transformation handler: 'ts_images' / direction: "rte"
491 * Processing images from database content going into the RTE.
492 * Processing includes converting the src attribute to an absolute URL.
493 *
494 * @param string $value Content input
495 * @return string Content output
496 */
public function TS_images_rte($value)
{
    // Split content by <img> tags and traverse the resulting array for processing:
    $imgSplit = $this->splitTags('img', $value);
    if (count($imgSplit) > 1) {
        $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
        $sitePath = str_replace(GeneralUtility::getIndpEnv('TYPO3_REQUEST_HOST'), '', $siteUrl);
        foreach ($imgSplit as $k => $v) {
            // Only odd keys hold image tags
            if ($k % 2 === 0) {
                continue;
            }
            // Get the attributes of the img tag
            list($attribArray) = $this->get_tag_attributes($v, true);
            // A tag without src is handled like one with an empty src
            $src = $attribArray['src'] ?? '';
            $absoluteUrl = trim($src);
            $isHttpUrl = stripos($absoluteUrl, 'http') === 0;
            // Inline "data:" URIs carry the image itself; prefixing them with the site URL would break them
            $isDataUri = stripos($absoluteUrl, 'data:') === 0;
            // Transform the src attribute into an absolute url, if it is not already
            if (!$isHttpUrl && !$isDataUri) {
                // If site is in a subpath (eg. /~user_jim/) this path needs to be removed because it will be added with $siteUrl
                $src = preg_replace('#^' . preg_quote($sitePath, '#') . '#', '', $src);
                $attribArray['src'] = $siteUrl . $src;
            }
            // Must have alt attribute
            if (!isset($attribArray['alt'])) {
                $attribArray['alt'] = '';
            }
            $imgSplit[$k] = '<img ' . GeneralUtility::implodeAttributes($attribArray, true, true) . ' />';
        }
    }
    // Return processed content:
    return implode('', $imgSplit);
}
527
528 /**
529 * Transformation handler: 'ts_links' / direction: "db"
530 * Processing anchor tags, and resolves them correctly again via the LinkService syntax
531 *
532 * Splits content into <a> tag blocks and processes each tag, and allows hooks to actually render
533 * the result.
534 *
535 * @param string $value Content input
536 * @return string Content output
537 */
protected function TS_links_db($value)
{
    // Odd keys of the split result hold complete <a>...</a> blocks
    $blockSplit = $this->splitIntoBlock('A', $value);
    foreach ($blockSplit as $k => $v) {
        if ($k % 2 === 0) {
            continue;
        }
        list($tagAttributes) = $this->get_tag_attributes($this->getFirstTag($v), true);

        // Anchors without an href (e.g. named anchors) are not links, so there is nothing to resolve for them
        if (isset($tagAttributes['href'])) {
            $linkService = GeneralUtility::makeInstance(LinkService::class);
            $linkInformation = $linkService->resolve($tagAttributes['href']);

            // Store the link as <a> tag as default by TYPO3, with the link service syntax
            try {
                $tagAttributes['href'] = $linkService->asString($linkInformation);
            } catch (UnknownLinkHandlerException $e) {
                // No handler for this link type: keep what was resolved, or the href as given
                $tagAttributes['href'] = $linkInformation['href'] ?? $tagAttributes['href'];
            }
        }

        // Rebuild the tag and process nested content recursively
        $blockSplit[$k] = '<a ' . GeneralUtility::implodeAttributes($tagAttributes, true) . '>'
            . $this->TS_links_db($this->removeFirstAndLastTag($v)) . '</a>';
    }
    return implode('', $blockSplit);
}
560
561 /**
562 * Transformation handler: 'css_transform' / direction: "db"
563 * Cleaning (->db) for standard content elements (ts)
564 *
565 * @param string $value Content input
566 * @return string Content output
567 * @see TS_transform_rte()
568 */
protected function TS_transform_db($value)
{
    // Safety... so forever loops are avoided (they should not occur, but an error would potentially do this...)
    $this->TS_transform_db_safecounter--;
    if ($this->TS_transform_db_safecounter < 0) {
        // Give the slot back before bailing out; otherwise every overflow would permanently
        // lower the allowed nesting depth for later calls on this instance
        $this->TS_transform_db_safecounter++;
        return $value;
    }
    // Split the content from RTE by the occurrence of these blocks:
    $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);

    // Avoid superfluous linebreaks by transform_db after ending headListTag
    while (count($blockSplit) > 0 && trim(end($blockSplit)) === '') {
        array_pop($blockSplit);
    }

    // Traverse the blocks
    foreach ($blockSplit as $k => $v) {
        if ($k % 2) {
            // Inside block:
            // Init:
            $tag = $this->getFirstTag($v);
            $tagName = strtolower($this->getFirstTagName($v));
            // Process based on the tag:
            switch ($tagName) {
                case 'blockquote':
                case 'dd':
                case 'div':
                case 'header':
                case 'section':
                case 'footer':
                case 'nav':
                case 'article':
                case 'aside':
                    // These blocks may contain further blocks, so their content is processed recursively
                    $blockSplit[$k] = $tag . $this->TS_transform_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
                    break;
                case 'pre':
                    // Preformatted content is kept exactly as it is
                    break;
                default:
                    // usually <hx> tags and <table> tags where no other block elements are within the tags
                    // Eliminate true linebreaks inside block element tags
                    $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
            }
        } else {
            // NON-block:
            if (trim($blockSplit[$k]) !== '') {
                $blockSplit[$k] = str_replace('<hr/>', '<hr />', $blockSplit[$k]);
                // Remove linebreaks preceding hr tags
                $blockSplit[$k] = preg_replace('/[' . LF . ']+<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/', '<$1$2/>', $blockSplit[$k]);
                // Remove linebreaks following hr tags
                $blockSplit[$k] = preg_replace('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>[' . LF . ']+/', '<$1$2/>', $blockSplit[$k]);
                // Replace other linebreaks with space
                $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
                $blockSplit[$k] = $this->divideIntoLines($blockSplit[$k]);
            } else {
                // Whitespace-only content between blocks is dropped
                unset($blockSplit[$k]);
            }
        }
    }
    $this->TS_transform_db_safecounter++;
    return implode(LF, $blockSplit);
}
630
631 /**
632 * Transformation handler: css_transform / direction: "rte"
633 * Set (->rte) for standard content elements (ts)
634 *
635 * @param string $value Content input
636 * @return string Content output
637 * @see TS_transform_db()
638 */
protected function TS_transform_rte($value)
{
    // Block elements whose content is itself run through this transformation
    $recursiveBlockTags = ['blockquote', 'dd', 'div', 'header', 'section', 'footer', 'nav', 'article', 'aside'];

    // Cut the database content apart at the configured block elements;
    // odd keys hold the blocks, even keys the content in between
    $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);

    foreach ($blockSplit as $k => $v) {
        if ($k % 2) {
            // Block element
            $tag = $this->getFirstTag($v);
            $tagName = strtolower($this->getFirstTagName($v));
            if (in_array($tagName, $recursiveBlockTags, true)) {
                $innerContent = $this->TS_transform_rte($this->removeFirstAndLastTag($blockSplit[$k]));
                $blockSplit[$k] = $tag . $innerContent . '</' . $tagName . '>';
            }
            // Strip the first line break (and spaces before it) from the content following the block
            $blockSplit[$k + 1] = preg_replace('/^[ ]*' . LF . '/', '', $blockSplit[$k + 1]);
            continue;
        }

        // Content outside of block elements
        $nextFTN = $this->getFirstTagName($blockSplit[$k + 1] ?? '');
        $onlyLineBreaks = preg_match('/^[ ]*' . LF . '+[ ]*$/', $blockSplit[$k]) === 1;

        // Line break clean-up applies when a block follows or when this is the last part
        if (GeneralUtility::inList($this->blockElementList, $nextFTN) || !isset($blockSplit[$k + 1])) {
            $blockSplit[$k] = $onlyLineBreaks
                // Nothing but line breaks: drop the leading one
                ? preg_replace('/^[ ]*' . LF . '/', '', $blockSplit[$k])
                // Real content: reduce the number of trailing line breaks by one
                : preg_replace('/(' . LF . '*)' . LF . '[ ]*$/', '$1', $blockSplit[$k]);
        }

        // Blank parts are removed, unless they consisted of line breaks only
        if ((string)$blockSplit[$k] === '' && !$onlyLineBreaks) {
            unset($blockSplit[$k]);
        } else {
            $blockSplit[$k] = $this->setDivTags($blockSplit[$k]);
        }
    }

    return implode(LF, $blockSplit);
}
689
690 /***************************************************************
691 *
692 * Generic RTE transformation, analysis and helper functions
693 *
694 **************************************************************/
695
696 /**
697 * Function for cleaning content going into the database.
698 * Content is cleaned eg. by removing unallowed HTML and ds-HSC content
699 * It is basically calling HTMLcleaner from the parent class with some preset configuration specifically set up for cleaning content going from the RTE into the db
700 *
701 * @param string $content Content to clean up
702 * @return string Clean content
703 * @see getKeepTags()
704 */
protected function HTMLcleaner_db($content)
{
    // Clean with the tag configuration for the "db" direction; the third HTMLcleaner argument is passed as false
    return $this->HTMLcleaner($content, $this->getKeepTags('db'), false);
}
710
711 /**
712 * Creates an array of configuration for the HTMLcleaner function based on whether content
713 * go TO or FROM the Rich Text Editor ($direction)
714 *
715 * @param string $direction The direction of the content being processed by the output configuration; "db" (content going into the database FROM the rte) or "rte" (content going into the form)
716 * @return array Configuration array
717 * @see HTMLcleaner_db()
718 */
protected function getKeepTags($direction = 'rte')
{
    // Serve the configuration from the in-memory cache when it was built before
    if (isset($this->getKeepTags_cache[$direction]) && is_array($this->getKeepTags_cache[$direction])) {
        return $this->getKeepTags_cache[$direction];
    }

    // Setting up allowed tags:
    // Default is to get allowed/denied tags from internal array of processing options:
    // Construct default list of tags to keep:
    if (isset($this->procOptions['allowTags.']) && is_array($this->procOptions['allowTags.'])) {
        $keepTags = implode(',', $this->procOptions['allowTags.']);
    } else {
        $keepTags = $this->procOptions['allowTags'] ?? '';
    }
    $keepTags = array_flip(GeneralUtility::trimExplode(',', $this->defaultAllowedTagsList . ',' . strtolower($keepTags), true));
    // For tags to deny, remove them from $keepTags array.
    // The keys of $keepTags are lower case, so the deny list has to be lower-cased as well to match.
    $denyTags = GeneralUtility::trimExplode(',', strtolower($this->procOptions['denyTags'] ?? ''), true);
    foreach ($denyTags as $deniedTag) {
        unset($keepTags[$deniedTag]);
    }
    // Based on the direction of content, set further options:
    switch ($direction) {
        case 'rte':
            // Transforming keepTags array so it can be understood by the HTMLcleaner function.
            // This basically converts the format of the array from TypoScript (having dots) to plain multi-dimensional array.
            list($keepTags) = $this->HTMLparserConfig($this->procOptions['HTMLparser_rte.'] ?? [], $keepTags);
            break;
        case 'db':
            // Setting up span tags if they are allowed:
            if (isset($keepTags['span'])) {
                $keepTags['span'] = [
                    'allowedAttribs' => 'id,class,style,title,lang,xml:lang,dir,itemscope,itemtype,itemprop',
                    'fixAttrib' => [
                        'class' => [
                            'removeIfFalse' => 1
                        ]
                    ],
                    'rmTagIfNoAttrib' => 1
                ];
                if (!empty($this->allowedClasses)) {
                    $keepTags['span']['fixAttrib']['class']['list'] = $this->allowedClasses;
                }
            }
            // Setting further options, getting them from the processing options
            $TSc = $this->procOptions['HTMLparser_db.'] ?? [];
            if (empty($TSc['globalNesting'])) {
                $TSc['globalNesting'] = 'b,i,u,a,center,font,sub,sup,strong,em,strike,span';
            }
            if (empty($TSc['noAttrib'])) {
                $TSc['noAttrib'] = 'b,i,u,br,center,hr,sub,sup,strong,em,li,ul,ol,blockquote,strike';
            }
            // Transforming the array from TypoScript to regular array:
            list($keepTags) = $this->HTMLparserConfig($TSc, $keepTags);
            break;
    }
    // Caching (internally, in object memory) the result
    $this->getKeepTags_cache[$direction] = $keepTags;

    return $keepTags;
}
777
/**
 * Breaks HTML coming from the RTE into one line per <p> section, joined by LF.
 * Used for content on its way into the database; setDivTags() is the inverse operation.
 *
 * Content found between <p> sections is reduced to the tags listed in
 * $this->allowedTagsOutsideOfParagraphs, dropped when nothing remains and otherwise
 * wrapped into a <p> tag of its own.
 *
 * @param string $value Value to process.
 * @param int $count Recursion brake, decremented on every nested call. Once it reaches zero the value is no longer split.
 * @param bool $returnArray If TRUE, the lines are returned as an array, otherwise as a LF-joined string.
 * @return string|array Processed input value. A plain string is always returned when no <p> section was found or the recursion brake hit.
 * @see setDivTags()
 */
protected function divideIntoLines($value, $count = 5, $returnArray = false)
{
    // Passing TRUE as third argument eliminates false end-tags
    $blocks = $this->splitIntoBlock('p', $value, true);
    $hasParagraphs = count($blocks) > 1;
    if (!$hasParagraphs || $count <= 0) {
        // Nothing to split (or nesting too deep): only tidy up the line breaks
        return $this->sanitizeLineBreaksForContentOnly($value);
    }

    $tagsKeptOutsideParagraphs = '<' . implode('><', $this->allowedTagsOutsideOfParagraphs) . '>';
    foreach ($blocks as $position => $block) {
        $isInsideParagraph = (bool)($position % 2);

        if (!$isInsideParagraph) {
            // Content between two <p> sections: strip disallowed tags and surplus line breaks
            $outside = trim(strip_tags($block, $tagsKeptOutsideParagraphs));
            $outside = $this->sanitizeLineBreaksForContentOnly($outside);
            if ((string)$outside === '') {
                // Positions without any content are removed entirely
                unset($blocks[$position]);
                continue;
            }
            // Put the textual part of the remaining content into a <p> tag
            $plainText = strip_tags($outside);
            $blocks[$position] = str_replace($plainText, '<p>' . $plainText . '</p>', $outside);
            continue;
        }

        // A <p> section: look for further (erroneously) nested <p> sections first
        $innerContent = $this->removeFirstAndLastTag($block);
        $subLines = $this->divideIntoLines($innerContent, $count - 1, true);
        if (is_array($subLines)) {
            // Nested paragraphs were found; their lines replace this section
            $line = implode(LF, $subLines);
        } else {
            // A regular line without nested paragraphs
            $line = $this->processContentWithinParagraph($subLines, $block);
        }

        // A line consisting only of &nbsp; becomes a truly blank line - unless it carries an
        // image or a tag with attributes, which have to be treated as possible content.
        $isBlankLine = trim(strip_tags($line)) === '&nbsp;'
            && !preg_match('/\\<(img)(\\s[^>]*)?\\/?>/si', $line)
            && !preg_match('/\\<([^>]*)?( align| class| style| id| title| dir| lang| xml:lang)([^>]*)?>/si', trim($line));
        $blocks[$position] = $isBlankLine ? '' : $line;
    }

    if ($returnArray) {
        return $blocks;
    }
    return implode(LF, $blocks);
}
834
/**
 * Turns every LF-separated line into a <p></p> section, unless the line is already
 * wrapped in a <p> or <div> tag or contains an <hr> tag.
 * Used for content travelling FROM the database TO the RTE.
 *
 * @param string $value Value to convert
 * @return string Processed value.
 * @see divideIntoLines()
 */
protected function setDivTags($value)
{
    // Configuration for HTMLcleaner(), which processes every line on its way to the RTE
    $keepTags = $this->getKeepTags('rte');
    $lines = explode(LF, $value);
    foreach ($lines as $index => $line) {
        if (trim($line) === '') {
            // Blank lines are represented by a non-breaking space
            $line = '&nbsp;';
        } else {
            // Clean the line, keeping unknown tags (they can be removed in the entryHTMLparser)
            $line = $this->HTMLcleaner($line, $keepTags, 'protect');
            // Double-encoded &nbsp; becomes a regular &nbsp; (could be reversed via the exitHTMLparser).
            // This used to be switchable through the option "dontConvAmpInNBSP_rte".
            $line = str_replace('&amp;nbsp;', '&nbsp;', $line);
        }

        // Lines containing an <hr> tag are never wrapped
        if (!preg_match('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i', $line)) {
            $normalized = strtolower(trim($line));
            $isWrappedInDiv = strpos($normalized, '<div') === 0 && substr($normalized, -6) === '</div>';
            $isWrappedInParagraph = strpos($normalized, '<p') === 0 && substr($normalized, -4) === '</p>';
            if (!$isWrappedInDiv && !$isWrappedInParagraph) {
                $line = '<p>' . $line . '</p>';
            }
        }

        $lines[$index] = $line;
    }
    return implode(LF, $lines);
}
875
/**
 * Used for transformation from RTE to DB
 *
 * Handles the content of one single <p> tag when storing into the database:
 * the content is run through HTMLcleaner_db() and stripped of line breaks, and the
 * <p> tag is rebuilt with only those attributes (and class names) that are allowed.
 *
 * @param string $content the content within the <p> tag
 * @param string $fullContentWithTag the same content including the surrounding <p> tag
 *
 * @return string the full <p> tag with cleaned content
 */
protected function processContentWithinParagraph(string $content, string $fullContentWithTag)
{
    $cleanedContent = $this->HTMLcleaner_db($content);
    // The opening <p> tag is the source of the attributes to validate
    $openingTag = $this->getFirstTag($fullContentWithTag);

    $tagAttributes = [];
    if (!empty($this->allowedAttributesForParagraphTags)) {
        list($parsedAttributes) = $this->get_tag_attributes($openingTag);
        // Drop every attribute that is not explicitly allowed
        $tagAttributes = array_intersect_key(
            $parsedAttributes,
            array_flip($this->allowedAttributesForParagraphTags)
        );

        // A class value that is not allowed as a whole gets reduced to its allowed class names
        $classValue = $tagAttributes['class'] ?? null;
        $needsClassFiltering = $classValue !== null
            && trim($classValue) !== ''
            && !empty($this->allowedClasses)
            && !in_array($classValue, $this->allowedClasses, true);
        if ($needsClassFiltering) {
            $keptClasses = array_intersect(
                GeneralUtility::trimExplode(' ', $classValue, true),
                $this->allowedClasses
            );
            if (empty($keptClasses)) {
                unset($tagAttributes['class']);
            } else {
                $tagAttributes['class'] = implode(' ', $keptClasses);
            }
        }
    }

    // A paragraph is stored as one single line
    $cleanedContent = str_replace(LF, '', $cleanedContent);
    // rtrim() removes the blank after "p" when there are no attributes
    $paragraphTag = rtrim('p ' . $this->compileTagAttribs($tagAttributes));
    return '<' . $paragraphTag . '>' . $cleanedContent . '</p>';
}
919
/**
 * Puts every <hr> tag on a line of its own and tidies up the resulting line breaks,
 * used when transforming from RTE to DB
 *
 * Each <hr> tag is rewritten to its self-closing form surrounded by LFs, pairs of
 * LFs are replaced by a single LF (one pass), and one LF at the very beginning
 * and at the very end is removed.
 *
 * @param string $content
 * @return string the modified content
 */
protected function sanitizeLineBreaksForContentOnly(string $content)
{
    $horizontalRulePattern = '/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i';
    $withSeparatedRules = preg_replace($horizontalRulePattern, LF . '<$1$2/>' . LF, $content);
    $withoutDoubleBreaks = str_replace(LF . LF, LF, $withSeparatedRules);
    $outerBreakPattern = '/(^' . LF . ')|(' . LF . '$)/i';
    return preg_replace($outerBreakPattern, '', $withoutDoubleBreaks);
}
933
/**
 * Finds the pixel width and height in an array of tag attributes.
 *
 * Values given as "width: <n>px" / "height: <n>px" in the "style" attribute win;
 * the plain "width" / "height" attributes are only consulted for a dimension the
 * style did not provide (missing or zero).
 *
 * @param array $attribArray Attributes of the tag. The keys "style", "width" and "height" are read; all of them are optional.
 * @return array Integer w/h in key 0/1. Zero is returned for a dimension that was not found.
 */
protected function getWHFromAttribs($attribArray)
{
    $style = trim((string)($attribArray['style'] ?? ''));
    $w = 0;
    $h = 0;
    if ($style !== '') {
        $regex = '[[:space:]]*:[[:space:]]*([0-9]*)[[:space:]]*px';
        $reg = [];
        // The look-behind prevents compound properties such as "max-width" or
        // "line-height" from being mistaken for the plain dimension.
        // Width
        if (preg_match('/(?<![\\w-])width' . $regex . '/i', $style, $reg)) {
            $w = (int)$reg[1];
        }
        // Height
        if (preg_match('/(?<![\\w-])height' . $regex . '/i', $style, $reg)) {
            $h = (int)$reg[1];
        }
    }
    // Fall back to the dedicated attributes, which may be absent as well
    if (!$w) {
        $w = $attribArray['width'] ?? 0;
    }
    if (!$h) {
        $h = $attribArray['height'] ?? 0;
    }
    return [(int)$w, (int)$h];
}
964
/**
 * Applies the "plainImageMode" processing option to the dimensions of an image tag.
 *
 * Supported modes:
 * - "lockDimensions": width and height are replaced by the dimensions of the image itself
 * - "lockRatioWhenSmaller": a width larger than the image's width is reduced to it,
 *   then the height is derived from the aspect ratio of the image
 * - "lockRatio": the height is derived from the current width and the aspect ratio of the image
 * The height is only derived when the image width is greater than zero. Without the
 * option, or with any other mode, the attributes are returned unchanged.
 *
 * @param array $imageInfo info array of the image: width in key 0, height in key 1
 * @param array $attribArray array of attributes of an image tag
 *
 * @return array a modified attributes array
 */
protected function applyPlainImageModeSettings($imageInfo, $attribArray)
{
    // The option is optional: an absent, empty or unknown mode matches no case below
    $mode = (string)($this->procOptions['plainImageMode'] ?? '');
    switch ($mode) {
        case 'lockDimensions':
            $attribArray['width'] = $imageInfo[0];
            $attribArray['height'] = $imageInfo[1];
            break;
        case 'lockRatioWhenSmaller':
            // Never scale an image up beyond its own width
            if (isset($attribArray['width']) && $attribArray['width'] > $imageInfo[0]) {
                $attribArray['width'] = $imageInfo[0];
            }
            if ($imageInfo[0] > 0) {
                $attribArray['height'] = round(($attribArray['width'] ?? 0) * ($imageInfo[1] / $imageInfo[0]));
            }
            break;
        case 'lockRatio':
            // Guarding against a zero width avoids a division by zero
            if ($imageInfo[0] > 0) {
                $attribArray['height'] = round(($attribArray['width'] ?? 0) * ($imageInfo[1] / $imageInfo[0]));
            }
            break;
    }
    return $attribArray;
}
999
/**
 * Called before any processing / transformation is made
 * Strips every CR (char 13) from the content, so that only LF (char 10) is
 * left as line break for the internal processing.
 *
 * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
 *
 * @param string $content the content to process
 * @return string the content without any CR
 */
protected function streamlineLineBreaksForProcessing(string $content)
{
    $carriageReturn = CR;
    $contentWithoutCarriageReturns = str_replace($carriageReturn, '', $content);
    return $contentWithoutCarriageReturns;
}
1014
/**
 * Called after any processing / transformation was made
 * Right before the RTE parser hands the content back, all line breaks
 * are unified to "CRLF" again.
 *
 * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
 *
 * @param string $content the content to process
 * @return string the content with CRLF line breaks
 */
protected function streamlineLineBreaksAfterProcessing(string $content)
{
    // Get rid of any CR that has entered in the meantime, so no LF is preceded by one already
    $lfOnlyContent = $this->streamlineLineBreaksForProcessing($content);
    // Now every LF can safely be expanded to CRLF
    return str_replace(LF, CRLF, $lfOnlyContent);
}
1032
/**
 * Content Transformation from DB to RTE
 * Resolves the href of every <a> tag; if it points to a specific page whose record
 * cannot be fetched, a "data-rte-error" attribute describing the problem is added.
 * Every <a> tag having a href is rebuilt, and its content is processed recursively.
 *
 * @param string $content
 * @return string the modified content
 */
protected function markBrokenLinks(string $content): string
{
    $segments = $this->splitIntoBlock('A', $content);
    $linkService = GeneralUtility::makeInstance(LinkService::class);
    foreach ($segments as $index => $segment) {
        // Only the odd positions hold <a> blocks
        $isAnchorBlock = $index % 2 !== 0;
        if (!$isAnchorBlock) {
            continue;
        }
        list($attributes) = $this->get_tag_attributes($this->getFirstTag($segment), true);
        if (empty($attributes['href'])) {
            continue;
        }

        $linkDetails = $linkService->resolve($attributes['href']);
        $pointsToSpecificPage = $linkDetails['type'] === LinkService::TYPE_PAGE
            && $linkDetails['pageuid'] !== 'current';
        if ($pointsToSpecificPage) {
            $pageRecord = BackendUtility::getRecord('pages', $linkDetails['pageuid']);
            $pageExists = is_array($pageRecord);
            if (!$pageExists) {
                $attributes['data-rte-error'] = 'Page with ID ' . $linkDetails['pageuid'] . ' not found';
            }
        }

        // The block is rewritten in any case, so nested content gets processed even if the page was found
        $openingTag = '<a ' . GeneralUtility::implodeAttributes($attributes, true, true) . '>';
        $innerContent = $this->markBrokenLinks($this->removeFirstAndLastTag($segment));
        $segments[$index] = $openingTag . $innerContent . '</a>';
    }
    return implode('', $segments);
}
1069
/**
 * Content Transformation from RTE to DB
 * Strips the error information that was attached to broken links from <a> tags:
 * the "data-rte-error" attribute and the highlighting inline style.
 * Every <a> tag having a href is rebuilt, and its content is processed recursively.
 *
 * @param string $content the content to process
 * @return string the modified content
 */
protected function removeBrokenLinkMarkers(string $content): string
{
    $highlightStyle = 'background-color: yellow; border:2px red solid; color: black;';
    $segments = $this->splitIntoBlock('A', $content);
    foreach ($segments as $index => $segment) {
        // Only the odd positions hold <a> blocks
        $isAnchorBlock = $index % 2 !== 0;
        if (!$isAnchorBlock) {
            continue;
        }
        list($attributes) = $this->get_tag_attributes($this->getFirstTag($segment), true);
        if (empty($attributes['href'])) {
            continue;
        }

        // The markers are removed regardless of whether the page was found,
        // so that they never end up in the database
        unset($attributes['data-rte-error']);
        if (isset($attributes['style'])) {
            $remainingStyle = trim(str_replace($highlightStyle, '', $attributes['style']));
            if (empty($remainingStyle)) {
                unset($attributes['style']);
            } else {
                $attributes['style'] = $remainingStyle;
            }
        }

        $openingTag = '<a ' . GeneralUtility::implodeAttributes($attributes, true, true) . '>';
        $innerContent = $this->removeBrokenLinkMarkers($this->removeFirstAndLastTag($segment));
        $segments[$index] = $openingTag . $innerContent . '</a>';
    }
    return implode('', $segments);
}
1104 }