Commit 04791227 authored by Alexander Nitsche
Browse files

[BUGFIX] Support parsing of XML files larger than 10 MB

EXT:impexp may run into memory limits when converting XML to PHP
arrays, even before all PHP memory is used up, because libxml has a
specific limit of 10 MB. This limit can be turned off by using the
`XML_PARSE_HUGE` (libxml) or `LIBXML_PARSEHUGE` (PHP) option, but
only for the DOMDocument implementation of libxml, and not for the
current XmlParser implementation.

By replacing the XmlParser implementation with a DOMDocument
implementation, larger XML files can be parsed with lower peak memory
consumption as a side effect. For example, parsing a 4 MB dummy XML
file consumes

56.03 MB (memory) / 168.72 MB (memory peak)

with the XmlParser, while using DOMDocument reduces the consumption to

56.15 MB (memory) / 60.08 MB (memory peak).

In addition to replacing the implementation, XML parsing has been moved
to separate classes (XmlEncoder / XmlDecoder), fully covered by tests,
and restructured to reduce the number of required parameters to a
minimum. The functional scope was not reduced in any way.

Resolves: #83580
Releases: main, 11.5
Change-Id: Ic3345d539f028d766b49d01096ec34a6190a6dfe
parent d84b0641
Pipeline #22360 failed with stages
in 11 minutes and 43 seconds
<?php
declare(strict_types=1);
/*
* This file is part of the TYPO3 CMS project.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/
namespace TYPO3\CMS\Core\Encoder\Exception;
use TYPO3\CMS\Core\Exception;
/**
 * An exception if something is wrong with the data to be encoded or decoded.
 *
 * XmlDecoder::decode() throws it when the XML string is empty, when libxml
 * reports an error while loading the XML, when the document contains a
 * document type node, or when no root node can be determined.
 *
 * The class adds no behaviour of its own; it only provides a dedicated type
 * that callers can catch.
 */
class InvalidDataException extends Exception
{
}
<?php
declare(strict_types=1);
/*
* This file is part of the TYPO3 CMS project.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/
namespace TYPO3\CMS\Core\Encoder;
use TYPO3\CMS\Core\Encoder\Exception\InvalidDataException;
use TYPO3\CMS\Core\Utility\MathUtility;
/**
* Decodes XML string to PHP array.
*
* A dedicated set of node attributes is considered during conversion:
* - attribute "index" specifies the final node name which is used as key in the PHP array
* - attribute "type" specifies the node value type which is used for casting
* - attribute "base64" specifies the node value type being binary and requiring a
* base64-decoding
* These attributes were applied during encoding of the PHP array with XmlEncoder::encode().
*
* The node name "n{number}" is converted to a number-indexed array key "{number}".
*
* @internal still experimental
*/
class XmlDecoder
{
    /**
     * This method serves as a wrapper for decode() and is used to replace
     * GeneralUtility::xml2array(), which returns an exception as a string instead of throwing it.
     * In perspective, all uses of this method should be replaced by decode() and the exceptions
     * should be handled locally.
     *
     * Every \Throwable raised by decode() is caught and its message is returned.
     *
     * @param string $xml XML string
     * @param XmlDecodingOptions|null $options Decoding configuration - see decode() for details
     * @return array|string PHP array - or a string if the XML root node is empty or an exception
     *                      occurred (in that case the exception message)
     */
    public function decodeWithReturningExceptionAsString(
        string $xml,
        ?XmlDecodingOptions $options = null
    ): array|string {
        try {
            return $this->decode($xml, $options);
        } catch (\Throwable $e) {
            return $e->getMessage();
        }
    }

    /**
     * Decodes an XML string to a PHP array.
     *
     * @param string $xml XML string
     * @param XmlDecodingOptions|null $options Apply specific decoding configuration - Ignored node
     *                                         types, libxml2 options, ... Defaults to a fresh
     *                                         XmlDecodingOptions instance if null.
     * @return array|string PHP array - or a string if the XML root node is empty
     * @throws InvalidDataException if the XML is empty, cannot be loaded by libxml, contains a
     *                              document type node or has no usable root node
     */
    public function decode(
        string $xml,
        ?XmlDecodingOptions $options = null
    ): array|string {
        $xml = trim($xml);
        if ($xml === '') {
            throw new InvalidDataException(
                'Invalid XML data, it can not be empty.',
                1630773210
            );
        }
        $options ??= new XmlDecodingOptions();
        $xml = $this->disableNamespaceInNodeNames($xml);

        // Collect libxml errors internally instead of emitting PHP warnings. The previous
        // setting is restored in "finally" so it is not left changed if loading throws.
        $previousUseInternalErrors = libxml_use_internal_errors(true);
        libxml_clear_errors();
        try {
            $dom = new \DOMDocument();
            $dom->loadXML($xml, $options->getLoadOptions());
            // Read the error before the error handling is switched back.
            $error = libxml_get_last_error();
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousUseInternalErrors);
        }
        if ($error !== false) {
            throw new InvalidDataException(
                'Line ' . $error->line . ': ' . xml_error_string($error->code),
                1630773230
            );
        }

        // The root node is the first top-level node that is not of an ignored type.
        $ignoredNodeTypes = $options->getIgnoredNodeTypes();
        $rootNode = null;
        foreach ($dom->childNodes as $child) {
            if ($child->nodeType === \XML_DOCUMENT_TYPE_NODE) {
                throw new InvalidDataException(
                    'Document types are not allowed.',
                    1630773261
                );
            }
            if (in_array($child->nodeType, $ignoredNodeTypes, true)) {
                continue;
            }
            $rootNode = $child;
            break;
        }
        if ($rootNode === null) {
            throw new InvalidDataException(
                'Root node cannot be determined.',
                1630773276
            );
        }

        $rootNodeName = $this->reactivateNamespaceInNodeNames($rootNode->nodeName);
        $result = $rootNode->hasChildNodes()
            ? $this->parseXml($rootNode, $options)
            : $rootNode->nodeValue;
        if ($options->includeRootNode()) {
            $result = [$rootNodeName => $result];
        }
        if ($options->returnRootNodeName() && is_array($result)) {
            $result['_DOCUMENT_TAG'] = $rootNodeName;
        }
        return $result;
    }

    /**
     * DOMDocument::loadXML() breaks if prefixes of undefined namespaces are used in node names:
     * Replace namespace divider ":" by temporary "___" string before parsing the XML.
     *
     * @throws InvalidDataException if the regular expression engine fails (preg_replace() returns
     *                              null), instead of violating the string return type
     */
    protected function disableNamespaceInNodeNames(string $value): string
    {
        $result = preg_replace(
            ['#<([/]?)([[:alnum:]_-]*):([[:alnum:]_-]*)([ >]?)#'],
            ['<$1$2___$3$4'],
            $value
        );
        if ($result === null) {
            throw new InvalidDataException(
                'Invalid XML data, namespace prefixes can not be processed: ' . preg_last_error_msg(),
                1718894561
            );
        }
        return $result;
    }

    /**
     * Re-insert the namespace divider again after parsing the XML.
     *
     * Every "___" sequence in the given name is replaced by ":".
     */
    protected function reactivateNamespaceInNodeNames(string $value): string
    {
        return str_replace('___', ':', $value);
    }

    /**
     * Recursively converts a DOM node to a PHP value.
     *
     * - A node without children yields its node value.
     * - A node whose only child is a text or CDATA node yields that child's value.
     * - Otherwise an array is built from the child nodes, skipping ignored node types.
     *
     * The array key of a child is its "index" attribute if present. Otherwise it is the node
     * name with the configured namespace prefix removed, where "n{number}" becomes the integer
     * key {number}. Children with a "base64" attribute are base64-decoded; otherwise a "type"
     * attribute triggers a cast of the value.
     */
    protected function parseXml(\DOMNode $node, XmlDecodingOptions $options): array|string|null
    {
        if (!$node->hasChildNodes()) {
            return $node->nodeValue;
        }
        if ($node->childNodes->length === 1
            && in_array($node->firstChild->nodeType, [\XML_TEXT_NODE, \XML_CDATA_SECTION_NODE], true)
        ) {
            return $node->firstChild->nodeValue;
        }

        $ignoredNodeTypes = $options->getIgnoredNodeTypes();
        $result = [];
        foreach ($node->childNodes as $child) {
            if (in_array($child->nodeType, $ignoredNodeTypes, true)) {
                continue;
            }
            $value = $this->parseXml($child, $options);

            if ($child instanceof \DOMElement && $child->hasAttribute('index')) {
                $key = $child->getAttribute('index');
            } else {
                $key = $this->reactivateNamespaceInNodeNames($child->nodeName);
                if ($options->hasNamespacePrefix()
                    && str_starts_with($key, $options->getNamespacePrefix())
                ) {
                    $key = substr($key, strlen($options->getNamespacePrefix()));
                }
                if (str_starts_with($key, 'n')
                    && MathUtility::canBeInterpretedAsInteger($index = substr($key, 1))
                ) {
                    $key = (int)$index;
                }
            }

            if ($child instanceof \DOMElement && $child->hasAttribute('base64')) {
                // Only strings can be base64-decoded; a node flagged as base64 that has child
                // elements yields an array, which is kept as is instead of raising a TypeError.
                if (is_string($value)) {
                    $value = base64_decode($value);
                }
            } elseif ($child instanceof \DOMElement && $child->hasAttribute('type')) {
                $value = $this->castValue($child->getAttribute('type'), $value);
            }
            $result[$key] = $value;
        }
        return $result;
    }

    /**
     * Casts a parsed node value according to the content of its "type" attribute.
     *
     * Handled types are "integer", "double", "boolean", "NULL" and "array"; for any other type
     * the value is returned unchanged.
     */
    private function castValue(string $type, array|string|null $value): array|string|int|float|bool|null
    {
        return match ($type) {
            'integer' => (int)$value,
            'double' => (float)$value,
            'boolean' => (bool)$value,
            'NULL' => null,
            'array' => is_array($value) ? $value : (empty($value) ? [] : (array)$value),
            default => $value,
        };
    }
}
<?php
declare(strict_types=1);
/*
* This file is part of the TYPO3 CMS project.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/
namespace TYPO3\CMS\Core\Encoder;
/**
* @internal still experimental
*/
class XmlDecodingOptions
{
    public const INCLUDE_ROOT_NODE = 'include_root_node';
    public const IGNORED_NODE_TYPES = 'ignored_node_types';
    public const LOAD_OPTIONS = 'load_options';
    public const NAMESPACE_PREFIX = 'namespace_prefix';
    public const RETURN_ROOT_NODE_NAME = 'return_root_node_name';

    /**
     * Option values keyed by the class constants. The values below are the defaults;
     * the constructor merges caller-supplied values over them.
     */
    protected array $options = [
        // XML node types that are skipped when converting to a PHP array.
        self::IGNORED_NODE_TYPES => [\XML_PI_NODE, \XML_COMMENT_NODE],
        // Whether the XML root node itself (true) or its children (false) form the
        // first level of the PHP array.
        self::INCLUDE_ROOT_NODE => false,
        // libxml2 options that are applied when loading the XML.
        self::LOAD_OPTIONS => \LIBXML_NONET | \LIBXML_NOBLANKS,
        // XML namespace prefix that is removed from each XML node name, for example "T3:".
        self::NAMESPACE_PREFIX => '',
        // Whether the name of the XML root node is added to the PHP array as key "_DOCUMENT_TAG".
        self::RETURN_ROOT_NODE_NAME => false,
    ];

    /**
     * @param array $options Option values keyed by the class constants; merged over the
     *                       defaults with array_merge()
     */
    public function __construct(array $options = [])
    {
        $defaults = $this->options;
        $this->options = array_merge($defaults, $options);
    }

    /**
     * Returns the libxml2 options to be used when loading the XML.
     */
    public function getLoadOptions(): int
    {
        return $this->option(self::LOAD_OPTIONS);
    }

    /**
     * Returns the list of XML node types to be ignored.
     */
    public function getIgnoredNodeTypes(): array
    {
        return $this->option(self::IGNORED_NODE_TYPES);
    }

    /**
     * Tells whether the root node is used as first level of the PHP array.
     */
    public function includeRootNode(): bool
    {
        return $this->option(self::INCLUDE_ROOT_NODE);
    }

    /**
     * Tells whether a non-empty namespace prefix is configured.
     */
    public function hasNamespacePrefix(): bool
    {
        return '' !== $this->option(self::NAMESPACE_PREFIX);
    }

    /**
     * Returns the configured namespace prefix, an empty string by default.
     */
    public function getNamespacePrefix(): string
    {
        return $this->option(self::NAMESPACE_PREFIX);
    }

    /**
     * Tells whether the root node name is added to the PHP array.
     */
    public function returnRootNodeName(): bool
    {
        return $this->option(self::RETURN_ROOT_NODE_NAME);
    }

    /**
     * Reads the stored value of a single option.
     */
    private function option(string $name): mixed
    {
        return $this->options[$name];
    }
}
<?php
declare(strict_types=1);
/*
* This file is part of the TYPO3 CMS project.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/
namespace TYPO3\CMS\Core\Encoder;
use TYPO3\CMS\Core\Utility\MathUtility;
/**
* Encodes PHP array to XML string.
*
* A dedicated set of entry properties is stored in XML during conversion:
* - XML node attribute "index" stores original entry key if XML node name differs from entry
* key
* - XML node attribute "type" stores entry value type ("boolean", "integer", "double", ...)
* - XML node attribute "base64" specifies if entry value is binary (for example an image)
* These attributes are interpreted during decoding of the XML string with XmlDecoder::decode().
*
* Specific encoding configuration can be set by $additionalOptions - for the full array or array paths.
* For example
* ```php
* $input = [
* 'numeric' => [
* 'value1',
* 'value2'
* ],
* 'numeric-n-index' => [
* 'value1',
* 'value2'
* ],
* 'nested' => [
* 'node1' => 'value1',
* 'node2' => [
* 'node' => 'value'
* ]
* ]
* ];
* $additionalOptions = [
* 'useIndexTagForNum' => 'numbered-index',
* 'alt_options' => [
* '/numeric-n-index' => [
* 'useNindex' => true
* ],
* '/nested' => [
* 'useIndexTagForAssoc' => 'nested-outer',
* 'clearStackPath' => true,
* 'alt_options' => [
* '/nested-outer' => [
* 'useIndexTagForAssoc' => 'nested-inner'
* ]
* ]
* ]
* ]
* ];
* ```
* =>
* ```xml
* <phparray>
* <numeric type="array">
* <numbered-index index="0">value1</numbered-index>
* <numbered-index index="1">value2</numbered-index>
* </numeric>
* <numeric-n-index type="array">
* <n0>value1</n0>
* <n1>value2</n1>
* </numeric-n-index>
* <nested type="array">
* <nested-outer index="node1">value1</nested-outer>
* <nested-outer index="node2" type="array">
* <nested-inner index="node">value</nested-inner>
* </nested-outer>
* </nested>
* </phparray>
* ```
* Available options are:
* - grandParentTagMap[grandParentTagName/parentTagName] [string]
* Convert array key X to XML node name "{grandParentTagMap}" with node attribute "index=X"
* - if grand-parent is "{grandParentTagName}" and parent node is "{parentTagName}".
* - parentTagMap[parentTagName:_IS_NUM] [string]
* Convert array key X to XML node name "{parentTagMap}" with node attribute "index=X"
* - if parent node is "{parentTagName}" and current node is number-indexed.
* - parentTagMap[parentTagName:nodeName] [string]
* Convert array key X to XML node name "{parentTagMap}" with node attribute "index=X"
* - if parent node is "{parentTagName}" and current node is "{nodeName}".
* - parentTagMap[parentTagName] [string]
* Convert array key X to XML node name "{parentTagMap}" with node attribute "index=X"
* - if parent node is "{parentTagName}".
* - useNindex [bool]
* Convert number-indexed array key X to XML node name "nX".
* - useIndexTagForNum [string]
* Convert number-indexed array key X to XML node name "{useIndexTagForNum}" with node
* attribute "index=X".
* - useIndexTagForAssoc [string]
* Convert associative array key X to XML node name "{useIndexTagForAssoc}" with node
* attribute "index=X".
* - disableTypeAttrib [bool|int]
* Disable node attribute "type" for all value types
* (true = disable for all except arrays, 2 = disable for all).
* - useCDATA [bool]
* Wrap node value with <![CDATA[{node value}]]> - if text contains special characters.
* - alt_options[/.../nodeName] [array]
* Set new options for specific array path.
* - clearStackPath [bool]
* Resetting internal counter when descending the array hierarchy: Allows using relative
* array path in nested "alt_options" instead of absolute path.
*
* @internal still experimental
*/
class XmlEncoder
{
/**
 * This method serves as a wrapper for encode() and is used to replace
 * GeneralUtility::array2xml(), which returns an exception as a string instead of throwing it.
 * In perspective, all uses of this method should be replaced by encode() and the exceptions
 * should be handled locally.
 *
 * Every \Throwable raised by encode() is caught and its message is returned.
 *
 * @param array $input PHP array
 * @param XmlEncodingOptions|null $options Encoding configuration - see encode() for details
 * @param array $additionalOptions Encoding options - see encode() for details
 * @return string XML - or the exception message if encoding failed
 */
public function encodeWithReturningExceptionAsString(
    array $input,
    ?XmlEncodingOptions $options = null,
    array $additionalOptions = []
): string {
    try {
        return $this->encode($input, $options, $additionalOptions);
    } catch (\Throwable $e) {
        return $e->getMessage();
    }
}
/**
 * Encodes a PHP array to an XML string.
 *
 * @param array $input PHP array
 * @param XmlEncodingOptions|null $options Apply specific encoding configuration - XML format,
 *                                         namespace prefix and root node name. Defaults to a
 *                                         fresh XmlEncodingOptions instance if null.
 * @param array $additionalOptions Apply specific encoding options - for the full array or specific array paths.
 * @return string XML string
 */
public function encode(
    array $input,
    ?XmlEncodingOptions $options = null,
    array $additionalOptions = []
): string {
    return $this->parseArray(
        $input,
        $options ?? new XmlEncodingOptions(),
        $additionalOptions
    );
}
protected function parseArray(
array $input,
XmlEncodingOptions $options,
array $additionalOptions,
int $level = 0,
array $stackData = []
): string {
$xml = '';
$rootNodeName = $options->getRootNodeName();
if (empty($rootNodeName)) {
$indentation = str_repeat($options->getIndentationStep(), $level);
} else {
$indentation = str_repeat($options->getIndentationStep(), $level + 1);
}
foreach ($input as $key => $value) {
// Construct the node name + attributes
$nodeName = $key = (string)$key;
$nodeAttributes = '';
if (isset(
$stackData['grandParentTagName'],
$stackData['parentTagName'],
$additionalOptions['grandParentTagMap'][$stackData['grandParentTagName'] . '/' . $stackData['parentTagName']]
)) {
// ... based on grand-parent + parent node name
$nodeName = (string)$additionalOptions['grandParentTagMap'][$stackData['grandParentTagName'] . '/' . $stackData['parentTagName']];
$nodeAttributes = ' index="' . htmlspecialchars($key) . '"';
} elseif (isset(
$stackData['parentTagName'],
$additionalOptions['parentTagMap'][$stackData['parentTagName'] . ':_IS_NUM']
) && MathUtility::canBeInterpretedAsInteger($nodeName)
) {
// ... based on parent node name + if current node name is numeric
$nodeName = (string)$additionalOptions['parentTagMap'][$stackData['parentTagName'] . ':_IS_NUM'];
$nodeAttributes = ' index="' . htmlspecialchars($key) . '"';
} elseif (isset(
$stackData['parentTagName'],
$additionalOptions['parentTagMap'][$stackData['parentTagName'] . ':' . $nodeName]
)) {
// ... based on parent node name + current node name
$nodeName = (string)$additionalOptions['parentTagMap'][$stackData['parentTagName'] . ':' . $nodeName];
$nodeAttributes = ' index="' . htmlspecialchars($key) . '"';
} elseif (isset(
$stackData['parentTagName'],
$additionalOptions['parentTagMap'][$stackData['parentTagName']]
)) {
// ... based on parent node name
$nodeName = (string)$additionalOptions['parentTagMap'][$stackData['parentTagName']];
$nodeAttributes = ' index="' . htmlspecialchars($key) . '"';
} elseif (MathUtility::canBeInterpretedAsInteger($nodeName)) {
// ... if current node name is numeric
if ($additionalOptions['useNindex'] ?? false) {
$nodeName = 'n' . $nodeName;
} else {
$nodeName = ($additionalOptions['useIndexTagForNum'] ?? false) ?: 'numIndex';
$nodeAttributes = ' index="' . $key . '"';
}
} elseif (!empty($additionalOptions['useIndexTagForAssoc'])) {
// ... if current node name is string
$nodeName = $additionalOptions['useIndexTagForAssoc'];
$nodeAttributes = ' index="' . htmlspecialchars($key) . '"';
}
$nodeName = $this->cleanUpNodeName($nodeName);
// Construct the node value
if (is_array($value)) {
// ... if has sub elements
if (isset($additionalOptions['alt_options'])
&& ($additionalOptions['alt_options'][($stackData['path'] ?? '') . '/' . $nodeName] ?? false)
) {
$subOptions = $additionalOptions['alt_options'][($stackData['path'] ?? '') . '/' . $nodeName];
$clearStackPath = (bool)($subOptions['clearStackPath'] ?? false);
} else {
$subOptions = $additionalOptions;
$clearStackPath = false;
}
if (empty($value)) {
$nodeValue = '';
} else {
$nodeValue = $options->getNewlineChar();
$nodeValue .= $this->parseArray(
$value,
$options,
$subOptions,
$level + 1,
[
'parentTagName' => $nodeName,
'grandParentTagName' => $stackData['parentTagName'] ?? '',
'path' => $clearStackPath ? '' : ($stackData['path'] ?? '') . '/' . $nodeName,
]
);
$nodeValue .= $indentation;
}
// Dropping the "type=array" attribute makes the XML prettier, but means that empty
// arrays are not restored with XmlDecoder::decode().
if (($additionalOptions['disableTypeAttrib'] ?? false) !== 2) {
$nodeAttributes .= ' type="array"';
}
} else {
// ... if is simple value
if ($this->isBinaryValue($value)) {
$nodeValue = $options->getNewlineChar() . chunk_split(base64_encode($value));
$nodeAttributes .= ' base64="1"';
} else {
$type = gettype($value);
if ($type === 'string') {
$nodeValue = htmlspecialchars($value);
if (($additionalOptions['useCDATA'] ?? false) && $nodeValue !== $value) {
$nodeValue = '<![CDATA[' . $value . ']]>';
}
} else {
$nodeValue = $value;
if (($additionalOptions['disableTypeAttrib'] ?? false) === false) {