[BUGFIX] Links on external pages don't get indexed 90/6990/10
author Mario Rimann <mario.rimann@typo3.org>
Fri, 5 Apr 2013 18:48:24 +0000 (20:48 +0200)
committer Georg Ringer <georg.ringer@gmail.com>
Tue, 16 Jul 2013 11:44:35 +0000 (13:44 +0200)
Allows the crawler to start indexing a specific file like
www.domain.tld/foobar.html instead of just www.domain.tld/

This is just about the comparison against the base URL and
enables the crawler to start crawling at e.g. a file that contains
a manually generated list of links to follow. Before this change,
even links to targets on the same domain were rejected by
the checkUrl() method if the base URL was pointing to some
file instead of "/". This was because the base URL was then not
a prefix of the target URL.
After stripping off any path from the base URL for this comparison,
crawling can now also start from a file.

Change-Id: I2727a9a447754b88d2c279c24b32b5c3a2df26c0
Resolves: #16534
Releases: 6.2, 6.1, 6.0, 4.7, 4.5
Reviewed-on: https://review.typo3.org/6990
Reviewed-by: Michael Stucki
Tested-by: Michael Stucki
Reviewed-by: Georg Ringer
Tested-by: Georg Ringer
typo3/sysext/indexed_search/Classes/Hook/CrawlerHook.php
typo3/sysext/indexed_search/Tests/Unit/Hook/CrawlerHookTest.php [new file with mode: 0644]

index 7daaee0..ff36b6d 100644 (file)
@@ -462,16 +462,28 @@ class CrawlerHook {
         *
         *****************************************/
        /**
-        * Check if an input URL are allowed to be indexed. Depends on whether it is already present in the url log.
+        * Check if an input URL is allowed to be indexed. Depends on whether the URL
+        * to be indexed is part of (= starts with) the same base URL and that this
+        * URL is not yet present in the url log.
+        *
+        * To check if the URL is on the same domain, any path after the domain name
+        * is stripped off for the comparison.
         *
         * @param       string          URL string to check
         * @param       array           Array of already indexed URLs (input url is looked up here and must not exist already)
-        * @param       string          Base URL of the indexing process (input URL must be "inside" the base URL!)
-        * @return      string          Returls the URL if OK, otherwise FALSE
+        * @param       string          Base URL of the indexing process (input URL must be "inside" the base URL!). If the base URL is pointing to a file, the path to the file is stripped off for checking.
+        * @return      mixed           Returns the URL if OK, otherwise FALSE
         * @todo Define visibility
         */
        public function checkUrl($url, $urlLog, $baseUrl) {
                $url = preg_replace('/\\/\\/$/', '/', $url);
+
+               // just get the root of the URL like http://www.domain.tld/ to verify
+               // the URL to be indexed is part of the same domain
+               $baseUrlArray = parse_url($baseUrl);
+               $baseUrl = $baseUrlArray['scheme'] . '://' . $baseUrlArray['host'] .
+                       ($baseUrlArray['port'] ? ':' . $baseUrlArray['port'] : '') . '/';
+
                list($url) = explode('#', $url);
                if (!strstr($url, '../')) {
                        if (GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
@@ -480,6 +492,8 @@ class CrawlerHook {
                                }
                        }
                }
+
+               return FALSE;
        }
 
        /**
diff --git a/typo3/sysext/indexed_search/Tests/Unit/Hook/CrawlerHookTest.php b/typo3/sysext/indexed_search/Tests/Unit/Hook/CrawlerHookTest.php
new file mode 100644 (file)
index 0000000..582bcba
--- /dev/null
@@ -0,0 +1,111 @@
+<?php
+namespace TYPO3\CMS\IndexedSearch\Tests\Unit\Hook;
+
+/***************************************************************
+ *  Copyright notice
+ *
+ *  (c) 2013 Mario Rimann (mario.rimann@typo3.org)
+ *  All rights reserved
+ *
+ *  This script is part of the TYPO3 project. The TYPO3 project is
+ *  free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  The GNU General Public License can be found at
+ *  http://www.gnu.org/copyleft/gpl.html.
+ *
+ *  This script is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  This copyright notice MUST APPEAR in all copies of the script!
+ ***************************************************************/
+
+/**
+ * Unit tests for the CrawlerHook class. The tests in this file cover how
+ * checkUrl() compares a URL against the base URL of the indexing process.
+ *
+ * @author Mario Rimann <mario.rimann@typo3.org>
+ */
+class CrawlerHookTest extends \TYPO3\CMS\Core\Tests\UnitTestCase {
+
+       /**
+        * CrawlerHook instance under test (a PHPUnit mock object created in setUp())
+        *
+        * @var \PHPUnit_Framework_MockObject_MockObject|\TYPO3\CMS\IndexedSearch\Hook\CrawlerHook
+        */
+       protected $fixture = NULL;
+
+
+       /**
+        * Sets up the test by creating the fixture as a mock of CrawlerHook.
+        *
+        * Only 'dummy' is passed in the list of methods to mock; presumably this
+        * leaves checkUrl() running its real implementation - verify against the
+        * PHPUnit version in use.
+        */
+       public function setUp() {
+               $this->fixture = $this->getMock('TYPO3\CMS\IndexedSearch\Hook\CrawlerHook', array('dummy'));
+       }
+
+       /**
+        * Explicitly release the CrawlerHook fixture to prevent any memory leaks
+        */
+       public function tearDown() {
+               $this->fixture = NULL;
+       }
+
+       /**
+        * A URL on the same host is returned unchanged when the base URL has no path.
+        *
+        * @test
+        */
+       public function checkUrlReturnsTheUrlOnBaseUrlWithoutPath() {
+               $this->assertSame(
+                       'http://typo3.org/about.html',
+                       $this->fixture->checkUrl(
+                               'http://typo3.org/about.html',
+                               array(),
+                               'http://typo3.org/'
+                       )
+               );
+       }
+
+       /**
+        * A URL on the same host is returned unchanged when the base URL points
+        * to a file (here linklist.html) instead of "/".
+        *
+        * @test
+        */
+       public function checkUrlReturnsTheUrlOnBaseUrlWithPath() {
+               $this->assertSame(
+                       'http://typo3.org/about.html',
+                       $this->fixture->checkUrl(
+                               'http://typo3.org/about.html',
+                               array(),
+                               'http://typo3.org/linklist.html'
+                       )
+               );
+       }
+
+       /**
+        * FALSE is returned for a URL on a different host when the base URL has no path.
+        *
+        * @test
+        */
+       public function checkUrlReturnsFalseOnUrlThatDoesNotMatchToTheBaseUrlOnBaseUrlWithoutPath() {
+               $this->assertFalse(
+                       $this->fixture->checkUrl(
+                               'http://www.w3c.org/about.html',
+                               array(),
+                               'http://typo3.org/'
+                       )
+               );
+       }
+
+       /**
+        * FALSE is returned for a URL on a different host when the base URL points
+        * to a file (here linklist.html) instead of "/".
+        *
+        * @test
+        */
+       public function checkUrlReturnsFalseOnUrlThatDoesNotMatchToTheBaseUrlOnBaseUrlWithPath() {
+               $this->assertFalse(
+                       $this->fixture->checkUrl(
+                               'http://www.w3c.org/about.html',
+                               array(),
+                               'http://typo3.org/linklist.html'
+                       )
+               );
+       }
+
+}
+?>
\ No newline at end of file