Fixed bug #13858: IS cannot index files if absRefPrefix is set and indexExternalU...
authorDmitry Dulepov <dmitry.dulepov@gmail.com>
Mon, 31 May 2010 08:35:37 +0000 (08:35 +0000)
committerDmitry Dulepov <dmitry.dulepov@gmail.com>
Mon, 31 May 2010 08:35:37 +0000 (08:35 +0000)
git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@7780 709f56b5-9817-0410-a4d7-c38de5d9e867

ChangeLog
typo3/sysext/indexed_search/class.indexer.php
typo3/sysext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php [new file with mode: 0644]

index d05e0d0..f625ba9 100755 (executable)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2010-05-31  Dmitry Dulepov  <dmitry.dulepov@gmail.com>
+
+       * Fixed bug #13858: IS cannot index files if absRefPrefix is set and indexExternalURLs is not (thanks to Christian Kuhn and Steffen Ritter for help!)
+
 2010-05-31  Benjamin Mack  <benni@typo3.org>
 
        * Fixed bug #13138: Add hook for manipulating content in felogin extension (Thanks to Thomas Layh)
index ad73f7e..8054b5c 100755 (executable)
@@ -788,7 +788,7 @@ class tx_indexedsearch_indexer {
                                $qParts = parse_url($linkSource);       // parse again due to new linkSource!
                        }
 
-                       if ($qParts['scheme'])  {
+                       if (!$linkInfo['localPath'] && $qParts['scheme']) {
                                if ($this->indexerConfig['indexExternalURLs'])  {
                                                // Index external URL (http or otherwise)
                                        $this->indexExternalUrl($linkSource);
@@ -839,46 +839,34 @@ class tx_indexedsearch_indexer {
        }
 
        /**
-        * Extracts all links to external documents from content string.
+        * Extracts all links to external documents from the HTML content string
         *
-        * @param       string          Content to analyse
-        * @return      array           Array of hyperlinks
+        * @param string $html
+        * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
         * @see extractLinks()
         */
-       function extractHyperLinks($string)     {
-               if (!is_object($this->htmlParser))      {
-                       $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
-               }
-
-               $parts = $this->htmlParser->splitTags('a',$string);
-               $list = array();
-               foreach ($parts as $k => $v)    {
-                       if ($k%2)       {
-                               $params = $this->htmlParser->get_tag_attributes($v,1);
-                               $firstTagName = $this->htmlParser->getFirstTagName($v); // The 'name' of the first tag
-
-                               switch (strtolower($firstTagName))      {
-                                       case 'a':
-                                               $src = $params[0]['href'];
-                                               if ($src)       {
-                                                               // Check if a local path to that file has been set - useful if you are using a download script.
-                                                       $md5 = t3lib_div::shortMD5($src);
-                                                       if (is_array($indexLocalFiles=$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']))  {
-                                                               $localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
-                                                       } else $localPath=false;
-
-                                                       $list[] = array(
-                                                               'tag' => $v,
-                                                               'href' => $params[0]['href'],
-                                                               'localPath' => $localPath
+       function extractHyperLinks($html)       {
+               $htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
+               $htmlParts = $htmlParser->splitTags('a', $html);
+               $hyperLinksData = array();
+               foreach ($htmlParts as $index => $tagData) {
+                       if (($index % 2) !== 0) {
+                               $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
+                               $firstTagName = $htmlParser->getFirstTagName($tagData);
+
+                               if (strtolower($firstTagName) == 'a') {
+                                       if ($tagAttributes[0]['href'] && $tagAttributes[0]['href']{0} != '#') {
+                                               $hyperLinksData[] = array(
+                                                       'tag' => $tagData,
+                                                       'href' => $tagAttributes[0]['href'],
+                                                       'localPath' => $this->createLocalPath($tagAttributes[0]['href'])
                                                        );
                                                }
-                                       break;
                                }
                        }
                }
 
-               return $list;
+               return $hyperLinksData;
        }
 
        /**
@@ -887,37 +875,26 @@ class tx_indexedsearch_indexer {
         * @param       string          Content to analyze
         * @return      string          The base href or an empty string if not found
         */
-       public function extractBaseHref($string) {
-               if (!is_object($this->htmlParser)) {
-                       $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
-               }
-
-               $parts = $this->htmlParser->splitTags('base', $string);
-               foreach ($parts as $key => $value) {
-                       if ($key % 2) {
-                               $params = $this->htmlParser->get_tag_attributes($value, 1);
-                               $firstTagName = $this->htmlParser->getFirstTagName($value); // The 'name' of the first tag
-
-                               switch (strtolower($firstTagName)) {
-                                       case 'base':
-                                               $href = $params[0]['href'];
+       public function extractBaseHref($html) {
+               $href = '';
+               $htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
+               $htmlParts = $htmlParser->splitTags('base', $html);
+               foreach ($htmlParts as $index => $tagData) {
+                       if (($index % 2) !== 0) {
+                               $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
+                               $firstTagName = $htmlParser->getFirstTagName($tagData);
+                               if (strtolower($firstTagName) == 'base') {
+                                       $href = $tagAttributes[0]['href'];
                                                if ($href) {
-                                                               // Return the first "base href" found (a single one should be present anyway)
-                                                       return $href;
+                                               break;
                                                }
                                }
                        }
                }
 
-               return '';
+               return $href;
        }
 
-
-
-
-
-
-
        /******************************************
         *
         * Indexing; external URL
@@ -985,15 +962,155 @@ class tx_indexedsearch_indexer {
 
 
 
+       /**
+        * Resolves a link to a local file path by trying each path strategy in turn
+        *
+        * @param string $sourcePath
+        * @return string Absolute path to file if file is local, else empty string
+        */
+       protected function createLocalPath($sourcePath) {
+               $localPath = '';
+               static $pathFunctions = array(
+                       'createLocalPathFromT3vars',
+                       'createLocalPathUsingAbsRefPrefix',
+                       'createLocalPathUsingDomainURL',
+                       'createLocalPathFromAbsoluteURL',
+                       'createLocalPathFromRelativeURL'
+                       );
+               foreach ($pathFunctions as $functionName) {
+                       $localPath = $this->$functionName($sourcePath);
+                       if ($localPath != '') {
+                               break;
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Attempts to create a local file path from T3VARs. This is useful for
+        * various download extensions that hide actual file name but still want the
+        * file to be indexed.
+        *
+        * @param string $sourcePath
+        * @return string
+        */
+       protected function createLocalPathFromT3vars($sourcePath) {
+               $localPath = '';
+               $indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
+               if (is_array($indexLocalFiles)) {
+                       $md5 = t3lib_div::shortMD5($sourcePath);
+                       // Note: not using self::isAllowedLocalFile here because this method
+                       // is allowed to index files outside of the web site (for example,
+                       // protected downloads)
+                       if (isset($indexLocalFiles[$md5]) && is_file($indexLocalFiles[$md5])) {
+                               $localPath = $indexLocalFiles[$md5];
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Attempts to create a local file path by matching the site URL (TYPO3_SITE_URL).
+        *
+        * @param string $sourcePath
+        * @return string
+        */
+       protected function createLocalPathUsingDomainURL($sourcePath) {
+               $localPath = '';
+               $baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
+               $baseURLLength = strlen($baseURL);
+               if (substr($sourcePath, 0, $baseURLLength) == $baseURL) {
+                       $sourcePath = substr($sourcePath, $baseURLLength);
+                       $localPath = PATH_site . $sourcePath;
+                       if (!self::isAllowedLocalFile($localPath)) {
+                               $localPath = '';
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Attempts to create a local file path by matching absRefPrefix. This
+        * requires TSFE. If TSFE is missing, this function does nothing.
+        *
+        * @param string $sourcePath
+        * @return string
+        */
+       protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
+               $localPath = '';
+               if ($GLOBALS['TSFE'] instanceof tslib_fe) {
+                       $absRefPrefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
+                       $absRefPrefixLength = strlen($absRefPrefix);
+                       if ($absRefPrefixLength > 0 && substr($sourcePath, 0, $absRefPrefixLength) == $absRefPrefix) {
+                               $sourcePath = substr($sourcePath, $absRefPrefixLength);
+                               $localPath = PATH_site . $sourcePath;
+                               if (!self::isAllowedLocalFile($localPath)) {
+                                       $localPath = '';
+                               }
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Attempts to create a local file path from an absolute URL path, that
+        * is, a URL without scheme that starts with a slash.
+        *
+        * @param string $sourcePath
+        * @return string
+        */
+       protected function createLocalPathFromAbsoluteURL($sourcePath) {
+               $localPath = '';
+               if ($sourcePath{0} == '/') {
+                       $sourcePath = substr($sourcePath, 1);
+                       $localPath = PATH_site . $sourcePath;
+                       if (!self::isAllowedLocalFile($localPath)) {
+                               $localPath = '';
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Attempts to create a local file path from the relative URL.
+        *
+        * @param string $sourcePath
+        * @return string
+        */
+       protected function createLocalPathFromRelativeURL($sourcePath) {
+               $localPath = '';
+               if (self::isRelativeURL($sourcePath)) {
+                       $localPath = PATH_site . $sourcePath;
+                       if (!self::isAllowedLocalFile($localPath)) {
+                               $localPath = '';
+                       }
+               }
+               return $localPath;
+       }
 
+       /**
+        * Checks if URL is relative.
+        *
+        * @param string $url
+        * @return boolean
+        */
+       static protected function isRelativeURL($url) {
+               $urlParts = @parse_url($url);
+               return ($urlParts['scheme'] == '' && $urlParts['path']{0} != '/');
+       }
 
-
-
+       /**
+        * Checks if the path points to the file inside the web site
+        *
+        * @param string $filePath
+        * @return boolean
+        */
+       static protected function isAllowedLocalFile($filePath) {
+               $filePath = t3lib_div::resolveBackPath($filePath);
+               $insideWebPath = (substr($filePath, 0, strlen(PATH_site)) == PATH_site);
+               $isFile = is_file($filePath);
+               return $insideWebPath && $isFile;
+       }
 
        /******************************************
         *
diff --git a/typo3/sysext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php b/typo3/sysext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php
new file mode 100644 (file)
index 0000000..ca5df21
--- /dev/null
@@ -0,0 +1,183 @@
+<?php
+/***************************************************************
+*  Copyright notice
+*
+*  (c) 2010 Dmitry Dulepov (dmitry.dulepov@gmail.com)
+*  All rights reserved
+*
+*  This script is part of the Typo3 project. The Typo3 project is
+*  free software; you can redistribute it and/or modify
+*  it under the terms of the GNU General Public License as published by
+*  the Free Software Foundation; either version 2 of the License, or
+*  (at your option) any later version.
+*
+*  The GNU General Public License can be found at
+*  http://www.gnu.org/copyleft/gpl.html.
+*
+*  This script is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*  GNU General Public License for more details.
+*
+*  This copyright notice MUST APPEAR in all copies of the script!
+***************************************************************/
+/**
+ * $Id$
+ *
+ */
+
+require_once(t3lib_extMgm::extPath('indexed_search', 'class.indexer.php'));
+
+/**
+  * This class contains unit tests for the indexer
+  *
+  * @author Dmitry Dulepov <dmitry.dulepov@gmail.com>
+  * @author Christian Kuhn <lolli@schwarzbu.ch>
+  * @package TYPO3
+  * @subpackage tx_indexedsearch
+  */
+class tx_indexedsearch_indexer_testcase extends tx_phpunit_testcase {
+
+       /**
+        * Indexer instance
+        *
+        * @var tx_indexedsearch_indexer
+        */
+       protected $indexer;
+
+       /**
+        * A name of the temporary file
+        *
+        * @var string
+        */
+       protected $temporaryFileName = '';
+
+       /**
+        * Sets up the test
+        *
+        * @return void
+        */
+       public function setUp() {
+               $this->indexer = t3lib_div::makeInstance('tx_indexedsearch_indexer');
+       }
+
+       /**
+        * Explicitly cleans up the indexer object to prevent any memory leaks
+        *
+        * @return void
+        */
+       public function tearDown() {
+               unset($this->indexer);
+               if ($this->temporaryFileName) {
+                       @unlink($this->temporaryFileName);
+               }
+       }
+
+       /**
+        * Checks that non-existing files are not returned
+        *
+        * @return void
+        */
+       public function testNonExistingLocalPath() {
+               $html = 'test <a href="' . md5(uniqid('')) . '">test</a> test';
+               $result = $this->indexer->extractHyperLinks($html);
+
+               $this->assertEquals(1, count($result), 'Wrong number of parsed links');
+               $this->assertEquals($result[0]['localPath'], '', 'Local path is incorrect');
+       }
+
+       /**
+        * Checks that using t3vars returns correct file
+        *
+        * @return void
+        */
+       public function testLocalPathWithT3Vars() {
+               $this->temporaryFileName = tempnam(sys_get_temp_dir(), 't3unit-');
+               $html = 'test <a href="testfile">test</a> test';
+               $savedValue = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
+               $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] = array(
+                       t3lib_div::shortMD5('testfile') => $this->temporaryFileName
+               );
+               $result = $this->indexer->extractHyperLinks($html);
+               $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] = $savedValue;
+
+               $this->assertEquals(1, count($result), 'Wrong number of parsed links');
+               $this->assertEquals($result[0]['localPath'], $this->temporaryFileName, 'Local path is incorrect');
+       }
+
+       /**
+        * Tests that a link starting with the site URL is resolved to a local path
+        *
+        * @return void
+        */
+       public function testLocalPathWithSiteURL() {
+               $baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
+               $html = 'test <a href="' . $baseURL . 'index.php">test</a> test';
+               $result = $this->indexer->extractHyperLinks($html);
+
+               $this->assertEquals(1, count($result), 'Wrong number of parsed links');
+               $this->assertEquals($result[0]['localPath'], PATH_site . 'index.php', 'Local path is incorrect');
+       }
+
+       /**
+        * Tests relative path
+        *
+        * @return void
+        */
+       public function testRelativeLocalPath() {
+               $html = 'test <a href="index.php">test</a> test';
+               $result = $this->indexer->extractHyperLinks($html);
+               $this->assertEquals(1, count($result), 'Wrong number of parsed links');
+               $this->assertEquals($result[0]['localPath'], PATH_site . 'index.php', 'Local path is incorrect');
+       }
+
+       /**
+        * Tests absolute path.
+        *
+        * @return void
+        */
+       public function testAbsoluteLocalPath() {
+               $path = substr(PATH_typo3, strlen(PATH_site) - 1);
+               $html = 'test <a href="' . $path . 'index.php">test</a> test';
+               $result = $this->indexer->extractHyperLinks($html);
+
+               $this->assertEquals(1, count($result), 'Wrong number of parsed links');
+               $this->assertEquals($result[0]['localPath'], PATH_typo3 . 'index.php', 'Local path is incorrect');
+       }
+
+       /**
+        * Tests that a path with the absRefPrefix returns correct result
+        *
+        * @return void
+        */
+       public function testLocalPathWithAbsRefPrefix() {
+               $absRefPrefix = '/' . md5(uniqid(''));
+               $html = 'test <a href="' . $absRefPrefix . 'index.php">test</a> test';
+               $savedPrefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
+               $GLOBALS['TSFE']->config['config']['absRefPrefix'] = $absRefPrefix;
+               $result = $this->indexer->extractHyperLinks($html);
+               $GLOBALS['TSFE']->config['config']['absRefPrefix'] = $savedPrefix;
+
+               $this->assertEquals(1, count($result), 'Wrong number of parsed links');
+               $this->assertEquals($result[0]['localPath'], PATH_site . 'index.php', 'Local path is incorrect');
+       }
+
+       /**
+        * Checks that base HREF is extracted correctly
+        *
+        * @return void
+        */
+       public function testExtractBaseHref() {
+               $baseHref = 'http://example.com/';
+               $html = '<html><head><Base Href="' . $baseHref . '" /></head></html>';
+               // The tag under test is <base>, so extractBaseHref() (not extractHyperLinks()) must be called
+               $result = $this->indexer->extractBaseHref($html);
+               $this->assertEquals($baseHref, $result, 'Incorrect base href was extracted');
+       }
+}
+
+if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/tests/class.tx_indexedsearch_indexer_testcase.php'])   {
+       include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/tests/class.tx_indexedsearch_indexer_testcase.php']);
+}
+
+?>
\ No newline at end of file