Fixed a few small bugs in indexed-search
author Kasper Skårhøj <kasper@typo3.org>
Tue, 15 Feb 2005 15:53:53 +0000 (15:53 +0000)
committer Kasper Skårhøj <kasper@typo3.org>
Tue, 15 Feb 2005 15:53:53 +0000 (15:53 +0000)
git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@563 709f56b5-9817-0410-a4d7-c38de5d9e867

t3lib/class.t3lib_cs.php
typo3/sysext/indexed_search/class.indexer.php
typo3/sysext/indexed_search/class.lexer.php
typo3/sysext/indexed_search/doc/TODO.txt
typo3/sysext/indexed_search/modfunc1/class.tx_indexedsearch_modfunc1.php

index bb2e86b..6983a0f 100755 (executable)
@@ -630,6 +630,8 @@ class t3lib_cs {
         */
        function utf8_encode($str,$charset)     {
 
+               if ($charset === 'utf-8')       return $str;
+
                        // Charset is case-insensitive.
                if ($this->initCharset($charset))       {       // Parse conv. table if not already...
                        $strLen = strlen($str);
@@ -647,21 +649,20 @@ class t3lib_cs {
                                        } else $outStr.=chr($this->noCharByteVal);      // No char exists
                                        $a++;
                                } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
-                                       if (isset($this->eucBasedSets[$charset]))       {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
+                                       if ($charset == 'shift_jis' && ($ord <160 || $ord>223)) {       // Shift-JIS is like EUC, but chars between 160 and 223 are single byte
                                                $a++;
-                                               $ord2=ord(substr($str,$a,1));
+                                               $ord2 = ord(substr($str,$a,1));
                                                $ord = $ord*256+$ord2;
-                                       }
-                                       elseif ($charset == 'shift_jis' && ($ord <160 || $ord>223))     {       // Shift-JIS is like EUC, but chars between 160 and 223 are single byte
+                                       } elseif (isset($this->eucBasedSets[$charset])) {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
                                                $a++;
-                                               $ord2=ord(substr($str,$a,1));
+                                               $ord2 = ord(substr($str,$a,1));
                                                $ord = $ord*256+$ord2;
                                        }
 
                                        if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
-                                               $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
-                                       } else $outStr.=chr($this->noCharByteVal);      // No char exists
-                               } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
+                                               $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
+                                       } else $outStr.= chr($this->noCharByteVal);     // No char exists
+                               } else $outStr.= $chr;  // ... otherwise it's just ASCII 0-127 and one byte. Transparent
                        }
                        return $outStr;
                }
index 57de73d..7c525bc 100755 (executable)
@@ -1072,6 +1072,7 @@ class tx_indexedsearch_indexer {
                reset($contentArr);
                while(list($key,)=each($contentArr)) {
                        if (strlen($contentArr[$key]))  {
+
                                if ($charset!=='utf-8') {
                                        $contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
                                }
@@ -1117,7 +1118,8 @@ class tx_indexedsearch_indexer {
                $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
                if ($maxL)      {
                                // Takes the quadruple lenght first, because whitespace and entities may be removed and thus shorten the string more yet.
-                       $bodyDescription = implode(' ',split('[[:space:],]+',substr(trim($contentArr['body']),0,$maxL*4)));
+       #               $bodyDescription = implode(' ',split('[[:space:],]+',substr(trim($contentArr['body']),0,$maxL*4)));
+                       $bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);
 
                                // Shorten the string:
                        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
index a950790..c8fa9a0 100755 (executable)
@@ -144,7 +144,6 @@ class tx_indexedsearch_lexer {
                                $pos = $start+$len;
                        } else break;
                }
-
                return $words;
        }
 
index fe7de9d..0a60673 100755 (executable)
@@ -21,6 +21,8 @@ Backend modules:
 Implement stop-word setting in: ""Top-20 words by count:" and a list seperate from that (in main module?)
 </diverse>
 
+Unittest for t3lib_cs converting Euc/shift_jis
+
 Test kaniner (indexed search / caching?):
        - 3DS
        - Metropol
index f3afb61..41fd551 100755 (executable)
@@ -566,8 +566,10 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
                                $lines[] = '<td>&nbsp;</td>';
                                $lines[] = '<td>Title</td>';
                                $lines[] = '<td bgcolor="red">'.$this->printRemoveIndexed(implode(',',$this->allPhashListed),'Clear ALL phash-rows below!').'</td>';
-                               $lines[] = '<td>Content</td>';
-                               $lines[] = '<td>Words</td>';
+                               $lines[] = '<td>Content<br/>
+                                                       <img src="clear.gif" width="300" height="1" alt="" /></td>';
+                               $lines[] = '<td>Words<br/>
+                                                       <img src="clear.gif" width="300" height="1" alt="" /></td>';
                        break;
                        default:
                                $lines[] = '<td>&nbsp;</td>';
@@ -1147,6 +1149,7 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
         */
        function utf8_to_currentCharset($string)        {
                global $LANG;
+
                if ($LANG->charSet != 'utf-8')  {
                        $string = $LANG->csConvObj->utf8_decode($string, $LANG->charSet, TRUE);
                }