Fixed bug #13732: External URL only indexes first page [indexed_search]
author Xavier Perseguers <typo3@perseguers.ch>
Tue, 13 Apr 2010 23:04:47 +0000 (23:04 +0000)
committer Xavier Perseguers <typo3@perseguers.ch>
Tue, 13 Apr 2010 23:04:47 +0000 (23:04 +0000)
git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@7356 709f56b5-9817-0410-a4d7-c38de5d9e867

ChangeLog
typo3/sysext/indexed_search/class.crawler.php
typo3/sysext/indexed_search/class.indexer.php

index 36a939c..68ac8eb 100755 (executable)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,6 @@
 2010-04-14  Xavier Perseguers  <typo3@perseguers.ch>
 
+       * Fixed bug #13732: External URL only indexes first page [indexed_search]
        * Raised DBAL version from 1.1.1 to 1.1.2
 
 2010-04-13  Ernesto Baschny  <ernst@cron-it.de>
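index 91fdf28..70d3010 100755 (executable)
--- a/typo3/sysext/indexed_search/class.crawler.php
+++ b/typo3/sysext/indexed_search/class.crawler.php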
@@ -641,6 +641,15 @@ class tx_indexedsearch_crawler {
                $indexerObj->indexExternalUrl($url);
                $url_qParts = parse_url($url);
 
+               $baseAbsoluteHref = $url_qParts['scheme'] . '://' . $url_qParts['host'];
+               $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
+               if (!$baseHref) {
+                               // Extract base href from current URL
+                       $baseHref = $baseAbsoluteHref;
+                       $baseHref .= substr($url_qParts['path'], 0, strrpos($url_qParts['path'], '/'));
+               }
+               $baseHref = rtrim($baseHref, '/');
+
                        // Get URLs on this page:
                $subUrls = array();
                $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
@@ -653,7 +662,12 @@ class tx_indexedsearch_crawler {
 
                        $qParts = parse_url($subUrl);
                        if (!$qParts['scheme']) {
-                               $subUrl = $url_qParts['scheme'].'://'.$url_qParts['host'].'/'.t3lib_div::resolveBackPath($subUrl);
+                               $relativeUrl = t3lib_div::resolveBackPath($subUrl);
+                               if ($relativeUrl{0} === '/') {
+                                       $subUrl = $baseAbsoluteHref . $relativeUrl;
+                               } else {
+                                       $subUrl = $baseHref . '/' . $relativeUrl;
+                               }
                        }
 
                        $subUrls[] = $subUrl;
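index 6681994..c49dd18 100755 (executable)
--- a/typo3/sysext/indexed_search/class.indexer.php
+++ b/typo3/sysext/indexed_search/class.indexer.php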
@@ -2,7 +2,7 @@
 /***************************************************************
 *  Copyright notice
 *
-*  (c) 2001-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
+*  (c) 2001-2010 Kasper Skaarhoj (kasperYYYY@typo3.com)
 *  All rights reserved
 *
 *  This script is part of the TYPO3 project. The TYPO3 project is
@@ -27,7 +27,7 @@
 /**
  * This class is a search indexer for TYPO3
  *
- * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
  * Originally Christian Jul Jensen <christian@jul.net> helped as well.
  */
 /**
@@ -881,9 +881,36 @@ class tx_indexedsearch_indexer {
                return $list;
        }
 
+       /**
+        * Extracts the "base href" from content string.
+        *
+        * @param       string          Content to analyze
+        * @return      string          The base href or an empty string if not found
+        */
+       public function extractBaseHref($string) {
+               if (!is_object($this->htmlParser)) {
+                       $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
+               }
 
+               $parts = $this->htmlParser->splitTags('base', $string);
+               foreach ($parts as $key => $value) {
+                       if ($key % 2) {
+                               $params = $this->htmlParser->get_tag_attributes($value, 1);
+                               $firstTagName = $this->htmlParser->getFirstTagName($value); // The 'name' of the first tag
+
+                               switch (strtolower($firstTagName)) {
+                                       case 'base':
+                                               $href = $params[0]['href'];
+                                               if ($href) {
+                                                               // Return the first "base href" found (a single one should be present anyway)
+                                                       return $href;
+                                               }
+                               }
+                       }
+               }
 
-
+               return '';
+       }