[BUGFIX] isValidUrl() idna converts whole URI 36/25636/26
authorMichiel Roos <michiel@maxserv.nl>
Fri, 22 Nov 2013 11:06:14 +0000 (12:06 +0100)
committerXavier Perseguers <xavier@typo3.org>
Sat, 21 Dec 2013 14:50:42 +0000 (15:50 +0100)
GeneralUtility::isValidUrl() idna converts whole URI instead of
domain only.

The expensive idna_convert() is called from isValidUrl(). Instead of
feeding it just the domain part, the whole URI is converted.

When supplying just the domain part, a great speed gain can be seen.

On seriously malformed URLs, parse_url() may return FALSE and emits an
E_WARNING. So we check for that first.

PHP no longer supports the flags FILTER_FLAG_HOST_REQUIRED and
FILTER_FLAG_SCHEME_REQUIRED. A scheme is now required by default. [1]
Return FALSE if the URL does not start with a scheme.

A public GeneralUtility::idnaEncode() method uses a static idna_convert
instance and fetches converted strings from a string cache array
to avoid multiple checks on the same domain.

All manual idna_convert instances are replaced with
GeneralUtility::idnaEncode() calls.

Special characters are not allowed in the URL except in the domain
part [2]. So the test with special characters in the path was removed
from the GeneralUtilityTest class.

[1] http://www.php.net/manual/en/filter.filters.flags.php#107382
[2] http://tools.ietf.org/html/rfc3986#appendix-A

Change-Id: I7a0ab0a399d9d6cf68c824f413be6b6d621947c1
Resolves: #53862
Releases: 6.2, 6.1, 6.0
Reviewed-on: https://review.typo3.org/25636
Reviewed-by: Markus Klein
Reviewed-by: Wouter Wolters
Tested-by: Wouter Wolters
Reviewed-by: Michiel Roos
Tested-by: Michiel Roos
Tested-by: Markus Klein
Reviewed-by: Andreas Wolf
Reviewed-by: Jo Hasenau
Reviewed-by: Xavier Perseguers
Tested-by: Xavier Perseguers
typo3/sysext/core/Classes/DataHandling/DataHandler.php
typo3/sysext/core/Classes/Utility/GeneralUtility.php
typo3/sysext/core/Tests/Unit/Utility/GeneralUtilityTest.php
typo3/sysext/frontend/Classes/ContentObject/ContentObjectRenderer.php

index 83d297a..3fdf322 100644 (file)
@@ -2504,12 +2504,8 @@ class DataHandler {
                                        $value = preg_replace('/[^a-zA-Z0-9_-]/', '', $value);
                                        break;
                                case 'domainname':
-                                       if (!preg_match('/^[a-z0-9\\.\\-]*$/i', $value)) {
-                                               GeneralUtility::requireOnce(PATH_typo3 . 'contrib/idna/idna_convert.class.php');
-                                               $idnaConvert = new \idna_convert();
-                                               $idnaConvert->set_parameter('idn_version', '2008');
-                                               $value = $idnaConvert->encode($value);
-                                               unset($idnaConvert);
+                                       if (!preg_match('/^[a-z0-9.\\-]*$/i', $value)) {
+                                               $value = GeneralUtility::idnaEncode($value);
                                        }
                                        break;
                                default:
index d1509ae..28d42a1 100644 (file)
@@ -70,6 +70,20 @@ class GeneralUtility {
         */
        static protected $applicationContext = NULL;
 
+       /**
+        * IDNA string cache
+        *
+        * @var array<string>
+        */
+       static protected $idnaStringCache = array();
+
+       /**
+        * IDNA converter
+        *
+        * @var \idna_convert
+        */
+       static protected $idnaConverter = NULL;
+
        /*************************
         *
         * GET/POST Variables
@@ -1050,6 +1064,18 @@ class GeneralUtility {
        /**
         * Checking syntax of input email address
         *
+        * http://tools.ietf.org/html/rfc3696
+        * International characters are allowed in email. So the whole address needs
+        * to be converted to punicode before passing it to filter_var(). We convert
+        * the user- and domain part separately to increase the chance of hitting an
+        * entry in self::$idnaStringCache.
+        *
+        * Also the @ sign may appear multiple times in an address. If not used as
+        * a boundary marker between the user- and domain part, it must be escaped
+        * with a backslash: \@. This mean we can not just explode on the @ sign and
+        * expect to get just two parts. So we pop off the domain and then glue the
+        * rest together again.
+        *
         * @param string $email Input string to evaluate
         * @return boolean Returns TRUE if the $email address (input string) is valid
         */
@@ -1058,9 +1084,17 @@ class GeneralUtility {
                if (!is_string($email)) {
                        return FALSE;
                }
-               require_once PATH_typo3 . 'contrib/idna/idna_convert.class.php';
-               $IDN = new \idna_convert(array('idn_version' => 2008));
-               return filter_var($IDN->encode($email), FILTER_VALIDATE_EMAIL) !== FALSE;
+               $atPosition = strrpos($email, '@');
+               if (!$atPosition || $atPosition + 1 === strlen($email)) {
+                       // Return if no @ found or it is placed at the very beginning or end of the email
+                       return FALSE;
+               }
+               $domain = substr($email, $atPosition + 1);
+               $user = substr($email, 0, $atPosition);
+               if (!preg_match('/^[a-z0-9.\\-]*$/i', $domain)) {
+                       $domain = self::idnaEncode($domain);
+               }
+               return filter_var($user . '@' . $domain, FILTER_VALIDATE_EMAIL) !== FALSE;
        }
 
        /**
@@ -1239,6 +1273,25 @@ class GeneralUtility {
        }
 
        /**
+        * Returns an ASCII string (punicode) representation of $value
+        *
+        * @param string $value
+        * @return string An ASCII encoded (punicode) string
+        */
+       static public function idnaEncode($value) {
+               if (isset(self::$idnaStringCache[$value])) {
+                       return self::$idnaStringCache[$value];
+               } else {
+                       if (!self::$idnaConverter) {
+                               require_once PATH_typo3 . 'contrib/idna/idna_convert.class.php';
+                               self::$idnaConverter = new \idna_convert(array('idn_version' => 2008));
+                       }
+                       self::$idnaStringCache[$value] = self::$idnaConverter->encode($value);
+                       return self::$idnaStringCache[$value];
+               }
+       }
+
+       /**
         * Returns a hex representation of a random byte string.
         *
         * @param integer $count Number of hex characters to return
@@ -1298,13 +1351,48 @@ class GeneralUtility {
        /**
         * Checks if a given string is a Uniform Resource Locator (URL).
         *
+        * On seriously malformed URLs, parse_url may return FALSE and emit an
+        * E_WARNING.
+        *
+        * filter_var() requires a scheme to be present.
+        *
+        * http://www.faqs.org/rfcs/rfc2396.html
+        * Scheme names consist of a sequence of characters beginning with a
+        * lower case letter and followed by any combination of lower case letters,
+        * digits, plus ("+"), period ("."), or hyphen ("-").  For resiliency,
+        * programs interpreting URI should treat upper case letters as equivalent to
+        * lower case in scheme names (e.g., allow "HTTP" as well as "http").
+        * scheme = alpha *( alpha | digit | "+" | "-" | "." )
+        *
+        * Convert the domain part to punicode if it does not look like a regular
+        * domain name. Only the domain part because RFC3986 specifies the the rest of
+        * the url may not contain special characters:
+        * http://tools.ietf.org/html/rfc3986#appendix-A
+        *
         * @param string $url The URL to be validated
         * @return boolean Whether the given URL is valid
         */
        static public function isValidUrl($url) {
-               require_once PATH_typo3 . 'contrib/idna/idna_convert.class.php';
-               $IDN = new \idna_convert(array('idn_version' => 2008));
-               return filter_var($IDN->encode($url), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED) !== FALSE;
+               $parsedUrl = parse_url($url);
+               if (!$parsedUrl || !isset($parsedUrl['scheme'])) {
+                       return FALSE;
+               }
+               // HttpUtility::buildUrl() will always build urls with <scheme>://
+               // our original $url might only contain <scheme>: (e.g. mail:)
+               // so we convert that to the double-slashed version to ensure
+               // our check against the $recomposedUrl is proper
+               if (!self::isFirstPartOfStr($url, $parsedUrl['scheme'] . '://')) {
+                       $url = str_replace($parsedUrl['scheme'] . ':', $parsedUrl['scheme'] . '://', $url);
+               }
+               $recomposedUrl = HttpUtility::buildUrl($parsedUrl);
+               if ($recomposedUrl !== $url) {
+                       // The parse_url() had to modify characters, so the URL is invalid
+                       return FALSE;
+               }
+               if (isset($parsedUrl['host']) && !preg_match('/^[a-z0-9.\\-]*$/i', $parsedUrl['host'])) {
+                       $parsedUrl['host'] = self::idnaEncode($parsedUrl['host']);
+               }
+               return filter_var(HttpUtility::buildUrl($parsedUrl), FILTER_VALIDATE_URL) !== FALSE;
        }
 
        /*************************
index b36b79b..22f691b 100644 (file)
@@ -1027,7 +1027,6 @@ class GeneralUtilityTest extends \TYPO3\CMS\Core\Tests\UnitTestCase {
                        // 'dot in local part' => array('firstname.lastname@employee.2something.com'),
                        // Fix / change if TYPO3 php requirement changed: Address ok with 5.2.6, but not ok with 5.3.2
                        // 'dash as local part' => array('-@foo.com'),
-                       'umlauts in local part' => array('äöüfoo@bar.com'),
                        'umlauts in domain part' => array('foo@äöüfoo.com')
                );
        }
@@ -1072,6 +1071,7 @@ class GeneralUtilityTest extends \TYPO3\CMS\Core\Tests\UnitTestCase {
                        'dot at beginning of domain part' => array('test@.com'),
                        // Fix / change if TYPO3 php requirement changed: Address ok with 5.2.6, but not ok with 5.3.2
                        // 'local part ends with dot' => array('e.x.a.m.p.l.e.@example.com'),
+                       'umlauts in local part' => array('äöüfoo@bar.com'),
                        'trailing whitespace' => array('test@example.com '),
                        'trailing carriage return' => array('test@example.com' . CR),
                        'trailing linefeed' => array('test@example.com' . LF),
@@ -1721,9 +1721,9 @@ class GeneralUtilityTest extends \TYPO3\CMS\Core\Tests\UnitTestCase {
        /**
         * Data provider for valid isValidUrl's
         *
-        * @return array Valid ressource
+        * @return array Valid resource
         */
-       public function validUrlValidRessourceDataProvider() {
+       public function validUrlValidResourceDataProvider() {
                return array(
                        'http' => array('http://www.example.org/'),
                        'http without trailing slash' => array('http://qwe'),
@@ -1744,15 +1744,14 @@ class GeneralUtilityTest extends \TYPO3\CMS\Core\Tests\UnitTestCase {
                        'http punicode subdomain' => array('http://xn--h-zfa.oebb.at'),
                        'http domain-name umlauts' => array('http://www.öbb.at'),
                        'http subdomain umlauts' => array('http://äh.oebb.at'),
-                       'http directory umlauts' => array('http://www.oebb.at/äöü/')
                );
        }
 
        /**
         * @test
-        * @dataProvider validUrlValidRessourceDataProvider
+        * @dataProvider validUrlValidResourceDataProvider
         */
-       public function validURLReturnsTrueForValidRessource($url) {
+       public function validURLReturnsTrueForValidResource($url) {
                $this->assertTrue(Utility\GeneralUtility::isValidUrl($url));
        }
 
@@ -1775,7 +1774,8 @@ class GeneralUtilityTest extends \TYPO3\CMS\Core\Tests\UnitTestCase {
                        'empty string' => array(''),
                        'string -1' => array('-1'),
                        'string array()' => array('array()'),
-                       'random string' => array('qwe')
+                       'random string' => array('qwe'),
+                       'http directory umlauts' => array('http://www.oebb.at/äöü/'),
                );
        }
 
index 8b8c616..90529aa 100644 (file)
@@ -6120,11 +6120,8 @@ class ContentObjectRenderer {
                                                        }
                                                        $LD['target'] = $target;
                                                        // Convert IDNA-like domain (if any)
-                                                       if (!preg_match('/^[a-z0-9\\.\\-]*$/i', $targetDomain)) {
-                                                               require_once PATH_typo3 . 'contrib/idna/idna_convert.class.php';
-                                                               $IDN = new \idna_convert();
-                                                               $targetDomain = $IDN->encode($targetDomain);
-                                                               unset($IDN);
+                                                       if (!preg_match('/^[a-z0-9.\\-]*$/i', $targetDomain)) {
+                                                               $targetDomain =  GeneralUtility::idnaEncode($targetDomain);
                                                        }
                                                        $this->lastTypoLinkUrl = $this->URLqMark(($absoluteUrlScheme . '://' . $targetDomain . '/index.php?id=' . $page['uid']), $addQueryParams) . $sectionMark;
                                                } else {