Commit d90f74db authored by Kasper Skårhøj's avatar Kasper Skårhøj
Browse files

Changes to indexed_search


git-svn-id: https://svn.typo3.org/TYPO3v4/Core/trunk@560 709f56b5-9817-0410-a4d7-c38de5d9e867
parent e4aa159e
This diff is collapsed.
......@@ -6007,7 +6007,7 @@ class tslib_cObj {
#?? $GLOBALS['TSFE']->sys_page->versionOL('pages',$row);
}
// Add record:
if (is_array($row) && $dontCheckEnableFields || $GLOBALS['TSFE']->checkPagerecordForIncludeSection($row)) {
if (is_array($row) && ($dontCheckEnableFields || $GLOBALS['TSFE']->checkPagerecordForIncludeSection($row))) {
// Add ID to list:
if ($begin<=0) {
if ($dontCheckEnableFields || $GLOBALS['TSFE']->checkEnableFields($row)) {
......
......@@ -1246,7 +1246,7 @@ class tslib_menu {
$uid = $mount_info['mount_pid'];
}
$recs = $this->sys_page->getMenu($uid,'uid,pid,doktype,mount_pid,mount_pid_ol');
$recs = $this->sys_page->getMenu($uid,'uid,pid,doktype,mount_pid,mount_pid_ol,nav_hide');
foreach($recs as $theRec) {
if (!t3lib_div::inList($this->doktypeExcludeList,$theRec['doktype']) && !$theRec['nav_hide']) { // If a menu item seems to be another type than 'Not in menu', then return true (there were items!)
return TRUE;
......
......@@ -823,7 +823,7 @@ class tslib_pibase {
if (isset($this->LOCAL_LANG[$this->LLkey][$key])) {
$word = $GLOBALS['TSFE']->csConv($this->LOCAL_LANG[$this->LLkey][$key]);
} elseif (isset($this->LOCAL_LANG['default'][$key])) {
$word = $this->LOCAL_LANG['default'][$key];
$word = $this->LOCAL_LANG['default'][$key]; // No charset conversion because default is english and thereby ASCII
} else {
$word = $this->LLtestPrefixAlt.$alt;
}
......
......@@ -90,18 +90,15 @@ class tslib_search {
var $group_by = 'PRIMARY_KEY'; // Alternatively 'PRIMARY_KEY'; sorting by primary key
var $default_operator = 'AND'; // Standard SQL-operator between words
var $operator_translate_table_caseinsensitive = '1';
var $operator_translate_table_caseinsensitive = TRUE;
var $operator_translate_table = Array ( // case-sensitiv. Defineres the words, which will be operators between words
Array ('+' , 'AND'),
Array ('|' , 'AND'),
Array ('-' , 'AND NOT'),
// english
Array ('AND' , 'AND'),
Array ('OR' , 'OR'),
Array ('NOT' , 'AND NOT'),
// danish
Array ('OG' , 'AND'),
Array ('ELLER' , 'OR'),
Array ('UDEN' , 'AND NOT')
Array ('and' , 'AND'),
Array ('or' , 'OR'),
Array ('not' , 'AND NOT'),
);
// Internal
......@@ -418,12 +415,12 @@ class tslib_search {
$op_array = $this->operator_translate_table;
reset ($op_array);
if ($this->operator_translate_table_caseinsensitive) {
$operator = strtoupper($operator);
$operator = strtolower($operator); // case-conversion is charset insensitive, but it doesn't spoil anything if input string AND operator table is already converted
}
while (list($key,$val) = each($op_array)) {
$item = $op_array[$key][0];
if ($this->operator_translate_table_caseinsensitive) {
$item = strtoupper($item);
$item = strtolower($item); // See note above.
}
if ($operator==$item) {
return $op_array[$key][1];
......@@ -463,10 +460,10 @@ class tslib_search {
* @return string URL-parameters with the searchwords
*/
function get_searchwords() {
$SWORD_PARAMS='';
$SWORD_PARAMS = '';
if (is_array($this->sword_array)) {
foreach($this->sword_array as $key => $val) {
$SWORD_PARAMS.='&sword_list[]='.rawurlencode($val['sword']);
$SWORD_PARAMS.= '&sword_list[]='.rawurlencode($val['sword']);
}
}
return $SWORD_PARAMS;
......@@ -480,7 +477,7 @@ class tslib_search {
function get_searchwordsArray() {
if (is_array($this->sword_array)) {
foreach($this->sword_array as $key => $val) {
$swords[]=$val['sword'];
$swords[] = $val['sword'];
}
}
return $swords;
......
......@@ -77,6 +77,8 @@ class tx_indexed_search_extparse {
// This array is configured in initialization:
var $app = array();
var $ext2itemtype_map = array();
var $supportedExtensions = array();
var $pObj; // Reference to parent object (indexer class)
......@@ -95,6 +97,7 @@ class tx_indexed_search_extparse {
// If windows, apply extension to tool name:
$exe = (TYPO3_OS == 'WIN') ? '.exe' : ''; // lg
$extOK = FALSE;
$mainExtension = '';
// Ignore extensions
$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
......@@ -187,32 +190,39 @@ class tx_indexed_search_extparse {
} else $this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
break;
case 'txt': // Raw text
case 'html': // PHP strip-tags()
case 'htm': // PHP strip-tags()
case 'csv': // Raw text
case 'xml': // PHP strip-tags()
case 'tif': // PHP EXIF
$extOK = TRUE;
break;
case 'html': // PHP strip-tags()
case 'htm': // PHP strip-tags()
$extOK = TRUE;
$mainExtension = 'html'; // making "html" the common "item_type"
break;
case 'jpg': // PHP EXIF
case 'jpeg': // PHP EXIF
case 'tif': // PHP EXIF
$extOK = TRUE;
$mainExtension = 'jpeg'; // making "jpeg" the common item_type
break;
}
// If extension was OK:
if ($extOK) {
$this->supportedExtensions[$extension] = TRUE;
$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
return TRUE;
}
}
/**
* Initialize external parser for backend modules
* Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc).
* Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
*
* @param string File extension to initialize for.
* @return boolean Returns true if the extension is supported and enabled, otherwise false.
*/
function initBackend($extension) {
function softInit($extension) {
switch($extension) {
case 'pdf': // PDF
case 'doc': // MS Word files
......@@ -236,6 +246,94 @@ class tx_indexed_search_extparse {
}
}
/**
 * Return title of entry in media type selector box.
 *
 * The indexer configuration is read from
 * $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']. Extensions listed in its
 * "ignoreExtensions" setting never get a title, and extensions that depend on an external
 * tool only get a title when the corresponding configuration key is set.
 *
 * @param string File extension
 * @return mixed String with label value of entry in media type search selector box (frontend plugin), or FALSE if the extension should have no entry (ignored, tool not configured, duplicate or unknown).
 */
function searchTypeMediaTitle($extension)	{

		// Force string type: switch() and in_array() compare loosely by default, so a non-string
		// value such as integer 0 could otherwise match the first case label.
	$extension = (string)$extension;

		// Read indexer-config
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Ignore extensions
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
	if (in_array($extension, $ignoreExtensions, TRUE))	{
		return FALSE;
	}

		// Switch on file extension:
	switch($extension)	{
		case 'pdf':
				// PDF tools
			if ($indexerConfig['pdftools'])	{
				return 'PDF';
			}
		break;
		case 'doc':
				// Catdoc
			if ($indexerConfig['catdoc'])	{
				return 'MS Word';
			}
		break;
		case 'pps':		// MS PowerPoint(?)
		case 'ppt':		// MS PowerPoint
				// ppthtml
			if ($indexerConfig['ppthtml'])	{
				return 'MS Powerpoint';
			}
		break;
		case 'xls':		// MS Excel
				// Xlhtml
			if ($indexerConfig['xlhtml'])	{
				return 'MS Excel';
			}
		break;
		case 'sxc':		// Open Office Calc.
		case 'sxi':		// Open Office Impress
		case 'sxw':		// Open Office Writer
				// Either of the two configuration keys enables the entry
			if ($indexerConfig['nativeOOMethod'] || $indexerConfig['ruby'])	{
				return 'Open Office';
			}
		break;
		case 'rtf':
				// unrtf
			if ($indexerConfig['unrtf'])	{
				return 'RTF';
			}
		break;
		case 'html':	// PHP strip-tags()
		case 'jpeg':	// PHP EXIF
		case 'txt':		// Raw text
		case 'csv':		// Raw text
		case 'xml':		// PHP strip-tags()
		case 'tif':		// PHP EXIF
				// No external tool needed; the label is simply the upper-cased extension
			return strtoupper($extension);
			// NO entry (duplicates or blank):
		case 'htm':		// PHP strip-tags()
		case 'jpg':		// PHP EXIF
		default:
		break;
	}

		// No label for this extension. Explicit FALSE keeps the return value consistent with the
		// "ignored extension" case above instead of falling off the end with NULL.
	return FALSE;
}
/**
 * Returns true if the input extension (item_type) is potentially a multi-page extension
 *
 * Currently 'pdf' is the only item type treated as multi-page.
 *
 * @param string Extension / item_type string
 * @return boolean TRUE if multi-page, otherwise FALSE
 */
function isMultiplePageExtension($extension)	{
		// The string cast keeps non-string input (e.g. integer 0) from loosely matching 'pdf';
		// returning the comparison result yields an explicit FALSE for every other type
		// instead of an implicit NULL.
	return (string)$extension === 'pdf';
}
......
......@@ -920,7 +920,7 @@ class tx_indexedsearch_indexer {
$this->internal_log = array();
$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
$Pstart = t3lib_div::milliseconds();
$subinfo = array('key' => $cPKey);
$subinfo = array('key' => $cPKey); // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
if ($check > 0 || $force) {
......@@ -1386,6 +1386,10 @@ class tx_indexedsearch_indexer {
*/
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {
// Find item Type:
$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
$storeItemType = $storeItemType ? $storeItemType : $ext;
// Remove any current data for this phash:
$this->removeOldIndexedFiles($hash['phash']);
......@@ -1399,7 +1403,7 @@ class tx_indexedsearch_indexer {
'cHashParams' => serialize($subinfo),
'contentHash' => $content_md5h,
'data_filename' => $file,
'item_type' => $ext,
'item_type' => $storeItemType,
'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
'item_description' => $this->bodyDescription($contentParts),
'item_mtime' => $mtime,
......
TIRSDAG:
- Improve lexer:
- See BASIC_LEXER from Oracle
- CJK hack from Zope.
......@@ -5,14 +6,52 @@
- Implement in search query analysis.
- TESTING with russian, danish, chinese, japanese etc...
- CVS
- split search words by new lexer function! (getSearchWords)
**************'
ONSDAG:
- Crawler
- Index Configurations / CLI indexer
TORSDAG / FREDAG:
- Test / Koordinering.
<diverse>
Search test:
- "message appears" - viste external media som IKKE var indekseret!?
- external media respect privacy of pages?
- external media on multiple pages with DIFFERENT languages?
- checkResume: Should it also check for gr_list "0,-1"?
Example på alternativ søgning!
XHTML i frontend?
Backend modules:
- Proper skinning? / getLL? / XHTML
-------------------------------
</diverse>
Test kaniner (indexed search / caching?):
- 3DS
- Metropol
- FI
- Link Factory
- Brunata
- TYPO3.org copy
**************
getLL with XML-support?
CRAWLER:
Purpose: To request URLs
Special instructions:
- Re-cache
- Publish
- Index
(combinations?)
(status data can be stored back in URL-record)
&L=[_TABLE:sys_language;_PID:0:_tx_indexedsearch_fields:bodytext,header]
&[_LOGIN]=[,kasper,francis;_PID:]
&myext[uid]=[1-34,35,36-10]&another=1?
......@@ -25,31 +64,27 @@ CRAWLER:
- strlen?
***************
TODO / projects:
*****************
Documentation:
- Configuration possibilities (piVars, TypoScript, Hooks etc)
- How to set up, analyse and debug indexed search (manual)
- Technical:
- utf-8 internally.
- Updates on tables structure
************************************************************
Statistics module:
- Someone write a statistics display module for the search operation! (Displaying content from index_stat_search and index_stat_word)
Olivier Dobberkau / dkd is on this.
Templating / Display in plugin:
- Localization, configuration of search-options, stylesheet formatting of result content (with new CSS Stylesheet Editor)
- Templating
- with Template Voila TOs?
- other approaches? (markers seem straight forward)
- linkPage() function
- Link correctly to MP links / external documents?
- Link correctly to external documents / URLs?
- CHECK: Which keys are necessary??? There are four keys on typo3.com. Are they all used by the indexer or what?
- Is result links working for frames? (&type=1) See Message-ID: <3DA762A0.84BDA4F1@kuehn.com>
- Implement Stop-words in search
Clean up backend modules:
- getLL()
- skinning()
- XHTML()
Misc:
- DOC: Tutorial on setting it up, getting it to run, trouble shoot it.
- add possibility of cron-job based crawler-indexing of any external site (based on configuration record in the page tree. Access to that page will determine whether external URL is part of result. Just like the external media is.)
- flags i pages tabel: set_for_indexing (reset when indexed), do not index, ...
Various:
- The checkbox "No Search" in the page header is only respected by indexed_search during indexing! (A page will not be indexed when "No Search" is set). However when searching results are not filtered based on this flag - so if a page is indexed before the no search flag is set it will be found in search results. To change this is hard because the getTreeList() function that fetches all page ids cannot take a where-clause to filter it out but must have hardcoded support. Alternatively the pages table must be joined into the search result so we can select on the field. A solution is still not agreed upon.
- The Tools>Indexing module could need some shining up and more useful features (Someone else does this?)
- CLI til removal of old indexes: First set flag, then 14 days later remove the records.
- CLI script for removal of old indexes: First set flag, then 14 days later remove the records.
Templating / Display in plugin:
- Templating
- with new Template API?
- Still need to put a group together.
......@@ -32,4 +32,18 @@ $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] = array(
'tif' => 'EXT:indexed_search/class.external_parser.php:&tx_indexed_search_extparse',
);
// EXAMPLE configuration of hooks:
/*
$TYPO3_CONF_VARS['EXTCONF']['indexed_search']['pi1_hooks'] = array (
'initialize_postProc' => 'EXT:indexed_search/example/class.pihook.php:&tx_indexedsearch_pihook',
'getResultRows' => 'EXT:indexed_search/example/class.pihook.php:&tx_indexedsearch_pihook',
'printResultRow' => 'EXT:indexed_search/example/class.pihook.php:&tx_indexedsearch_pihook',
'prepareResultRowTemplateData_postProc' => 'EXT:indexed_search/example/class.pihook.php:&tx_indexedsearch_pihook',
);
*/
// EXAMPLE of adding fields to root line:
#$TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields']['level3'] = 3;
?>
\ No newline at end of file
......@@ -85,6 +85,7 @@ CREATE TABLE index_section (
uniqid int(11) DEFAULT '0' NOT NULL auto_increment,
PRIMARY KEY (uniqid),
KEY joinkey (phash,rl0),
# KEY phash_pid (phash,page_id),
KEY page_id (page_id),
KEY rl0 (rl0,rl1,phash),
KEY rl0_2 (rl0,phash)
......
......@@ -20,7 +20,7 @@ plugin.tx_indexedsearch {
lang=0
desc=0
results=0
# defOp.1=1
# extResume=1
}
......@@ -31,19 +31,37 @@ plugin.tx_indexedsearch {
path_stdWrap {
}
search {
rootPidList =
rootPidList =
page_links = 10
}
result_link_target =
/*
flagRendering = CASE
flagRendering {
key.current = 1
2 = TEXT
2.value = German
default = TEXT
default.value = English
}
*/
/*
iconRendering = CASE
iconRendering {
key.current = 1
html = TEXT
html.value = HtmL
default = TEXT
default.value = TYPO3 pages
}
*/
tableParams {
secHead = border=0 cellpadding=0 cellspacing=0 width="100%"
searchBox = border=0 cellpadding=0 cellspacing=0
searchRes = border=0 cellpadding=0 cellspacing=0 width="100%"
}
# Setting default values for piVars (please see the source code for the form-field names which you can preset values for here)
_DEFAULT_PI_VARS.extResume=1
_CSS_DEFAULT_STYLE (
......@@ -61,7 +79,8 @@ plugin.tx_indexedsearch {
.tx-indexedsearch .tx-indexedsearch-res .tx-indexedsearch-secHead { margin-top:20px; margin-bottom:5px; }
.tx-indexedsearch .tx-indexedsearch-res .tx-indexedsearch-secHead H2 { margin-top:0px; margin-bottom:0px; }
.tx-indexedsearch .tx-indexedsearch-res .tx-indexedsearch-secHead TABLE { background:#cccccc; }
.tx-indexedsearch .tx-indexedsearch-res .tx-indexedsearch-secHead TD { vertical-align:absmiddle; }
.tx-indexedsearch .tx-indexedsearch-res .tx-indexedsearch-secHead TD { vertical-align:middle; }
.tx-indexedsearch .tx-indexedsearch-res .noResume { color : #666666; }
)
_LOCAL_LANG {
}
......
......@@ -29,6 +29,9 @@
*
* @author Kasper Skårhøj <kasperYYYY@typo3.com>
*/
/**
* [CLASS/FUNCTION INDEX of SCRIPT]
*/
unset($MCONF);
......@@ -41,9 +44,14 @@ t3lib_extMgm::isLoaded("indexed_search",1);
require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
// ***************************
// Script Classes
// ***************************
/**
* Backend module providing boring statistics of the index-tables.
*
* @author Kasper Skaarhoj <kasperYYYY@typo3.com>
* @package TYPO3
* @subpackage tx_indexedsearch
*/
class SC_mod_tools_isearch_index {
var $MCONF=array();
var $MOD_MENU=array();
......@@ -53,6 +61,9 @@ class SC_mod_tools_isearch_index {
var $include_once=array();
var $content;
/**
* Initialization
*/
function init() {
global $BE_USER,$LANG,$BACK_PATH,$TCA_DESCR,$TCA,$CLIENT,$TYPO3_CONF_VARS;
$this->MCONF = $GLOBALS["MCONF"];
......@@ -138,9 +149,23 @@ class SC_mod_tools_isearch_index {
}
// ***************************
// OTHER FUNCTIONS:
// ***************************
/***************************
*
* OTHER FUNCTIONS:
*
***************************/
/**
*
*/
function getRecordsNumbers() {
$tables=explode(",","index_phash,index_words,index_rel,index_grlist,index_section,index_fulltext");
$recList=array();
......
......@@ -177,7 +177,7 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);
// Init parser and if it returns false, unset its entry again:
if (!$this->external_parsers[$extension]->initBackend($extension)) {
if (!$this->external_parsers[$extension]->softInit($extension)) {
unset($this->external_parsers[$extension]);
}
}
......@@ -1133,10 +1133,10 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
if ($fullPath) {
$info = @getimagesize($fullPath);
$iconPath = $GLOBALS['BACK_PATH'].'../'.substr($fullPath,strlen(PATH_site));
$this->iconFileNameCache[$it] = is_array($info) ? '<img src="'.$iconPath.'" '.$info[3].' title="'.htmlspecialchars($alt).'" alt="" />' : '';
$this->iconFileNameCache[$it] = is_array($info) ? '<img src="'.$iconPath.'" '.$info[3].' title="###TITLE_ATTRIBUTE###" alt="" />' : '';
}
}
return $this->iconFileNameCache[$it];
return str_replace('###TITLE_ATTRIBUTE###',htmlspecialchars($it.': '.$alt),$this->iconFileNameCache[$it]);
}
/**
......@@ -1310,6 +1310,7 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
$indexerObj->backend_initIndexer($this->pObj->id, 0, 0, '', $rl, $GETparams, $cfgRow['chashcalc'] ? TRUE : FALSE);
$indexerObj->backend_setFreeIndexUid($cfgRow['uid']);
$theContent = '';
foreach($fieldList as $k => $v) {
if (!$k) {
$theTitle = $r[$v];
......@@ -1317,6 +1318,7 @@ class tx_indexedsearch_modfunc1 extends t3lib_extobjbase {
$theContent.= $r[$v].' ';
}
}
debug($theContent,$theTitle);
$indexerObj->backend_indexAsTYPO3Page(
$theTitle,
'',
......
......@@ -23,9 +23,6 @@ $LOCAL_LANG = Array (
'opt_media_-1' => 'All media',
'opt_media_0' => 'Internal pages',
'opt_media_-2' => 'All External',
'opt_media_1' => 'HTML',
'opt_media_2' => 'PDF',
'opt_media_3' => 'MS Word',
'opt_order_rank_flag' => 'Weight/Frequency',
'opt_order_rank_freq' => 'Frequency',
'opt_order_rank_first' => 'Close to top',
......@@ -88,6 +85,8 @@ $LOCAL_LANG = Array (
'local_operator_AND' => 'AND',
'local_operator_OR' => 'OR',
'local_operator_NOT' => 'NOT',
'makerating_addToCurrentSearch' => 'Add to current search words',
'maketitle_matches' => 'matches',
),
'dk' => Array (
'submit_button_label' => 'Sg',
......@@ -380,7 +379,7 @@ OG, ELLER og IKKE er kommandoord som overskriver standard s
'rules_header' => 'Regole:',
'rules_text' => 'Sono accettate solo parole con due o pi caratteri, per un massimo di 200 caratteri totali.
Lo spazio usato per separare le parole, "" pu essere usato per cercare un\'intera stringa.
AND, OR e NOT possono essere usati prefissi alle parole da cercare.
AND, OR e NOT possono essere usati prefissi alle parole da cercare.
+/|/- corrispondono agli operatori AND, OR e NOT.
Tutte le parole sono convertite in caratteri minuscoli.',
'searchFor' => 'Cerca',
......@@ -1336,7 +1335,7 @@ Sve tra
'form_match' => 'Illeszkeds:',
'form_style' => 'Stlus:',