[FEATURE] Enhance fulltext search 49/47849/2
author Francois Suter <francois@typo3.org>
Fri, 22 Apr 2016 11:07:46 +0000 (13:07 +0200)
committer Francois Suter <francois@typo3.org>
Fri, 22 Apr 2016 11:10:17 +0000 (13:10 +0200)
Support more operators for boolean mode. Add relevant unit tests.

Resolves: #75041
Releases: 2.0
Change-Id: I3511c95b94e103ed55bb35ca03d0a2c7ea880cc2
Reviewed-on: https://review.typo3.org/47849
Reviewed-by: Francois Suter <francois@typo3.org>
Tested-by: Francois Suter <francois@typo3.org>
ChangeLog
Classes/Parser/FulltextParser.php
Documentation/Queries/Fulltext/Index.rst
Tests/Unit/SqlBuilderTest.php

index ebf35d2..85ecd5a 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,9 +1,22 @@
+2016-04-22 Francois Suter (Cobweb)  <typo3@cobweb.ch>
+
+       * Enhanced fulltext search capabilities, resolves #75041
+
+2016-04-17 Francois Suter (Cobweb)  <typo3@cobweb.ch>
+
+       * Fixed failing unit tests, resolves #75722
+
 2016-04-15 Francois Suter (Cobweb)  <typo3@cobweb.ch>
 
        * Applied PSR-2 formatting and other cleanups, resolves #75678
        * Moved language files to XLIFF, resolves #75685
        * Changed default query cache duration to 0, resolves #57138
 
+2016-03-14 Fabien Udriot  <fabien.udriot@ecodev.ch>
+
+       * Added composer file, resolves #75051
+       * Ensured compatibility with TYPO3 CMS 7 LTS, resolves #75052
+
 2015-09-27 Francois Suter (Cobweb)  <typo3@cobweb.ch>
 
        * Fixed crashing query check wizard, resolves #70136
index caac927..bef6317 100644 (file)
@@ -51,6 +51,11 @@ class FulltextParser
     protected $configuration;
 
     /**
+     * @var array List of allowed fulltext operators (see http://dev.mysql.com/doc/refman/5.6/en/fulltext-boolean.html)
+     */
+    static protected $fullTextOperators = array('+', '-', '~', '>', '<');
+
+    /**
      * Constructor
      *
      * @return FulltextParser
@@ -115,7 +120,7 @@ class FulltextParser
     public function parse($table, $index, $search, $isNaturalSearch, $isNegated)
     {
         $this->retrieveIndexedFields($table);
-        if (isset($this->indexedFields[$index])) {
+        if (array_key_exists($index, $this->indexedFields)) {
             $indexFields = $this->indexedFields[$index];
         } else {
             throw new InvalidQueryException(
@@ -123,11 +128,11 @@ class FulltextParser
                     1421769189
             );
         }
-        // Search terms from a query string will be urlencode'd
-        $processedSearchTerms = urldecode($search);
         $booleanMode = '';
-        if (!$isNaturalSearch) {
-            $processedSearchTerms = $this->processSearchTerm($processedSearchTerms);
+        if ($isNaturalSearch) {
+            $processedSearchTerms = addslashes($search);
+        } else {
+            $processedSearchTerms = $this->processSearchTerm($search);
             $booleanMode = ' IN BOOLEAN MODE';
         }
         if (empty($processedSearchTerms)) {
@@ -156,38 +161,55 @@ class FulltextParser
         $termsProcessed = array();
 
         // Handle double quote wrapping
-        if (preg_match_all('/".+"/isU', $term, $matches)) {
-
+        // Take all double-quoted strings and replace them with a ###EXTRACTED(number)### construct
+        // These terms are not processed further
+        // Terms within brackets are also not handled further
+        $searches = array();
+        $replacements = array();
+        if (preg_match_all('/["(].+[")]/isU', $term, $matches)) {
+            $counter = 1;
             foreach ($matches as $match) {
-                $searchedCharacters = array(
-                        '"',
-                        ' '
-                );
-                $replacedCharacters = array(
-                        '',
-                        '###'
-                );
-                $search = $match;
-                $replace = str_replace($searchedCharacters, $replacedCharacters, $match);
-                $term = str_replace($search, $replace, $term);
+                $searches[] = $match[0];
+                $replacements[] = '###EXTRACTED' . $counter . '###';
+                $counter++;
             }
+            $term = str_replace($searches, $replacements, $term);
         }
 
+        // Now that double-quoted and brackets-wrapped strings have been extracted,
+        // get each search term by splitting on spaces
         $terms = explode(' ', $term);
         foreach ($terms as $aTerm) {
-            if (!empty($aTerm)) {
-                // Handle exclusion of term
-                $logic = '+';
-                if (strpos($aTerm, '-') === 0) {
+            // Take extracted strings as is
+            if (strpos($aTerm, '###EXTRACTED') === 0) {
+                $termsProcessed[] = $aTerm;
+            } elseif (!empty($aTerm)) {
+                $operator = substr($aTerm, 0, 1);
+                $wildcard = substr($aTerm, -1);
+                if (in_array($operator, self::$fullTextOperators, true)) {
                     $aTerm = substr($aTerm, 1);
-                    $logic = '-';
+                } else {
+                    $operator = '';
+                }
+                if ($wildcard === '*') {
+                    $aTerm = substr($aTerm, 0, -1);
+                } else {
+                    $wildcard = '';
                 }
-                if (strlen($aTerm) >= $this->configuration['fullTextMinimumWordLength']) {
-                    $termProcessed = str_replace('###', ' ', addslashes($aTerm));
-                    $termsProcessed[] = sprintf('%s"%s"', $logic, $termProcessed);
+                // Eliminate search terms which are too short (except if wildcard is used)
+                if ($wildcard === '*' || strlen($aTerm) >= $this->configuration['fullTextMinimumWordLength']) {
+                    $termsProcessed[] = $operator . addslashes($aTerm) . $wildcard;
                 }
             }
         }
-        return implode(' ', $termsProcessed);
+        // Assemble the processed string
+        $processedSearchString = implode(' ', $termsProcessed);
+        // If double-quoted or brackets-wrapped terms had been extracted, put them back
+        if (count($searches) > 0) {
+            // Escape every string before replacing it again
+            $searches = array_map('addslashes', $searches);
+            $processedSearchString = str_replace($replacements, $searches, $processedSearchString);
+        }
+        return $processedSearchString;
     }
 }
index 4879b86..9f26daa 100644 (file)
@@ -44,6 +44,8 @@ fulltext query requires several components.
    boolean search, the latter a natural language search. Please refer
    to the `MySQL documentation for more details <http://dev.mysql.com/doc/refman/5.6/en/fulltext-search.html>`_.
 
+   All operators are supported except for the distance operator (:code:`@`).
+
    .. note::
 
       Query expansion is currently not supported.
index a44f0cc..2708f9c 100644 (file)
@@ -689,7 +689,7 @@ abstract class SqlBuilderTest extends UnitTestCase
     public function fulltextFilterProvider()
     {
         $filters = array(
-            // Boolean mode, one word valid, one word ignore
+            // Boolean mode, one word valid, one word ignored
             'fulltext, one valid word, one invalid word' => array(
                     'filter' => array(
                             'filters' => array(
@@ -708,7 +708,7 @@ abstract class SqlBuilderTest extends UnitTestCase
                             )
                     ),
                     'index' => 'SEARCH',
-                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'+"foox"\' IN BOOLEAN MODE))'
+                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'foox\' IN BOOLEAN MODE))'
             ),
             // Boolean mode, one word included, one word excluded
             'fulltext, one word included, one word excluded' => array(
@@ -720,7 +720,7 @@ abstract class SqlBuilderTest extends UnitTestCase
                                             'conditions' => array(
                                                     0 => array(
                                                             'operator' => 'fulltext',
-                                                            'value' => 'foox -barz',
+                                                            'value' => '+foox -barz',
                                                             'negate' => false
                                                     )
                                             )
@@ -728,7 +728,7 @@ abstract class SqlBuilderTest extends UnitTestCase
                             )
                     ),
                     'index' => 'SEARCH',
-                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'+"foox" -"barz"\' IN BOOLEAN MODE))'
+                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'+foox -barz\' IN BOOLEAN MODE))'
             ),
             // Boolean mode with quoted string
             'fulltext, quoted string' => array(
@@ -748,7 +748,87 @@ abstract class SqlBuilderTest extends UnitTestCase
                             )
                     ),
                     'index' => 'SEARCH',
-                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'+"go for foox"\' IN BOOLEAN MODE))'
+                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'\"go for foox\"\' IN BOOLEAN MODE))'
+            ),
+            // Boolean mode with two quoted strings
+            'fulltext, two quoted string' => array(
+                    'filter' => array(
+                            'filters' => array(
+                                    0 => array(
+                                            'table' => 'tt_content',
+                                            'field' => 'score',
+                                            'conditions' => array(
+                                                    0 => array(
+                                                            'operator' => 'fulltext',
+                                                            'value' => '"foo bar" "baz ding"',
+                                                            'negate' => false
+                                                    )
+                                            )
+                                    )
+                            )
+                    ),
+                    'index' => 'SEARCH',
+                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'\"foo bar\" \"baz ding\"\' IN BOOLEAN MODE))'
+            ),
+            // Boolean mode with unquoted and quoted strings
+            'fulltext, unquoted and quoted string' => array(
+                    'filter' => array(
+                            'filters' => array(
+                                    0 => array(
+                                            'table' => 'tt_content',
+                                            'field' => 'score',
+                                            'conditions' => array(
+                                                    0 => array(
+                                                            'operator' => 'fulltext',
+                                                            'value' => '"foo bar" -bazy',
+                                                            'negate' => false
+                                                    )
+                                            )
+                                    )
+                            )
+                    ),
+                    'index' => 'SEARCH',
+                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'\"foo bar\" -bazy\' IN BOOLEAN MODE))'
+            ),
+            // Boolean mode with brackets in string
+            'fulltext, brackets in string' => array(
+                    'filter' => array(
+                            'filters' => array(
+                                    0 => array(
+                                            'table' => 'tt_content',
+                                            'field' => 'score',
+                                            'conditions' => array(
+                                                    0 => array(
+                                                            'operator' => 'fulltext',
+                                                            'value' => '+foox (>bar <baz)',
+                                                            'negate' => false
+                                                    )
+                                            )
+                                    )
+                            )
+                    ),
+                    'index' => 'SEARCH',
+                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'+foox (>bar <baz)\' IN BOOLEAN MODE))'
+            ),
+            // Boolean mode with wildcard
+            'fulltext, with wildcard' => array(
+                    'filter' => array(
+                            'filters' => array(
+                                    0 => array(
+                                            'table' => 'tt_content',
+                                            'field' => 'score',
+                                            'conditions' => array(
+                                                    0 => array(
+                                                            'operator' => 'fulltext',
+                                                            'value' => 'foo*',
+                                                            'negate' => false
+                                                    )
+                                            )
+                                    )
+                            )
+                    ),
+                    'index' => 'SEARCH',
+                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'foo*\' IN BOOLEAN MODE))'
             ),
             // Boolean mode, negated condition
             'fulltext, negated condition' => array(
@@ -760,7 +840,6 @@ abstract class SqlBuilderTest extends UnitTestCase
                                             'conditions' => array(
                                                     0 => array(
                                                             'operator' => 'fulltext',
-                                                            // "bar" should be ignored, as it is below minimum word length
                                                             'value' => 'foox',
                                                             'negate' => true
                                                     )
@@ -769,7 +848,7 @@ abstract class SqlBuilderTest extends UnitTestCase
                             )
                     ),
                     'index' => 'SEARCH',
-                    'fulltextCondition' => '(NOT MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'+"foox"\' IN BOOLEAN MODE))'
+                    'fulltextCondition' => '(NOT MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'foox\' IN BOOLEAN MODE))'
             ),
             // Natural mode
             'fulltext natural' => array(
@@ -791,6 +870,26 @@ abstract class SqlBuilderTest extends UnitTestCase
                     'index' => 'SEARCH',
                     'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'foo bar\'))'
             ),
+            // Natural mode, quotes get escaped
+            'fulltext natural, quotes escaped' => array(
+                    'filter' => array(
+                            'filters' => array(
+                                    0 => array(
+                                            'table' => 'tt_content',
+                                            'field' => 'score',
+                                            'conditions' => array(
+                                                    0 => array(
+                                                            'operator' => 'fulltext_natural',
+                                                            'value' => 'foo don\'t',
+                                                            'negate' => false
+                                                    )
+                                            )
+                                    )
+                            )
+                    ),
+                    'index' => 'SEARCH',
+                    'fulltextCondition' => '(MATCH(tt_content.header,tt_content.bodytext) AGAINST(\'foo don\\\'t\'))'
+            ),
             // Empty search words
             'fulltext, empty search' => array(
                     'filter' => array(
@@ -841,7 +940,6 @@ abstract class SqlBuilderTest extends UnitTestCase
                                             'conditions' => array(
                                                     0 => array(
                                                             'operator' => 'fulltext',
-                                                            // "bar" should be ignored, as it is below minimum word length
                                                             'value' => 'foox bar',
                                                             'negate' => false
                                                     )
@@ -864,6 +962,7 @@ abstract class SqlBuilderTest extends UnitTestCase
      * @param string $fulltextCondition Interpreted condition
      * @test
      * @dataProvider fulltextFilterProvider
+     * @covers \Tesseract\Dataquery\Parser\FulltextParser
      */
     public function selectQueryWithFulltextFilter($filter, $index, $fulltextCondition)
     {
@@ -905,8 +1004,10 @@ abstract class SqlBuilderTest extends UnitTestCase
         $this->sqlParser->addFilter($filter);
         $actualResult = $this->sqlParser->buildQuery();
 
-        self::assertEquals($expectedResult, $actualResult,
-                '***Expected***' . $expectedResult . '***Actual***' . $actualResult);
+        self::assertEquals(
+                $expectedResult,
+                $actualResult
+        );
     }
 
     /**