Index: database/database.mysql
===================================================================
RCS file: /cvs/drupal/drupal/database/database.mysql,v
retrieving revision 1.192
diff -u -r1.192 database.mysql
--- database/database.mysql 31 Jul 2005 10:12:47 -0000 1.192
+++ database/database.mysql 4 Aug 2005 01:10:55 -0000
@@ -539,6 +539,16 @@
 ) TYPE=MyISAM;
 
 --
+-- Table structure for table 'search_dataset'
+--
+CREATE TABLE search_dataset (
+  sid int(10) unsigned NOT NULL default '0',
+  type varchar(16) default NULL,
+  data longtext NOT NULL,
+  KEY sid_type (sid, type)
+) TYPE=MyISAM;
+
+--
 -- Table structure for table 'search_index'
 --
 
@@ -549,8 +559,8 @@
   fromsid int(10) unsigned NOT NULL default '0',
   fromtype varchar(16) default NULL,
   score int(10) unsigned default NULL,
-  KEY sid (sid),
-  KEY fromsid (fromsid),
+  KEY sid_type (sid, type),
+  KEY from_sid_type (fromsid, fromtype),
   KEY word (word)
 ) TYPE=MyISAM;
 
@@ -560,7 +570,7 @@
 
 CREATE TABLE search_total (
   word varchar(50) NOT NULL default '',
-  count int(10) unsigned default NULL,
+  count float default NULL,
   PRIMARY KEY (word)
 ) TYPE=MyISAM;
 
Index: database/database.pgsql
===================================================================
RCS file: /cvs/drupal/drupal/database/database.pgsql,v
retrieving revision 1.131
diff -u -r1.131 database.pgsql
--- database/database.pgsql 31 Jul 2005 10:12:47 -0000 1.131
+++ database/database.pgsql 4 Aug 2005 01:10:56 -0000
@@ -548,6 +548,16 @@
 );
 
 --
+-- Table structure for table 'search_dataset'
+--
+CREATE TABLE search_dataset (
+  sid integer NOT NULL default '0',
+  type varchar(16) default NULL,
+  data text NOT NULL default ''
+);
+CREATE INDEX search_dataset_sid_type_idx ON search_dataset(sid, type);
+
+--
 -- Table structure for search_index
 --
 
@@ -559,8 +569,8 @@
   fromtype varchar(16) default NULL,
   score integer default NULL
 );
-CREATE INDEX search_index_sid_idx ON search_index(sid);
-CREATE INDEX search_index_fromsid_idx ON search_index(fromsid);
+CREATE INDEX search_index_sid_type_idx ON search_index(sid, 
type);
+CREATE INDEX search_index_from_sid_type_idx ON search_index(fromsid, fromtype);
 CREATE INDEX search_index_word_idx ON search_index(word);
 
 --
Index: database/updates.inc
===================================================================
RCS file: /cvs/drupal/drupal/database/updates.inc,v
retrieving revision 1.123
diff -u -r1.123 updates.inc
--- database/updates.inc 31 Jul 2005 10:12:47 -0000 1.123
+++ database/updates.inc 4 Aug 2005 01:11:00 -0000
@@ -119,7 +119,8 @@
   "2005-05-12" => "update_140",
   "2005-05-22" => "update_141",
   "2005-07-29" => "update_142",
-  "2005-07-30" => "update_143"
+  "2005-07-30" => "update_143",
+  "2005-08-04" => "update_144"
 );
 
 function update_32() {
@@ -2533,6 +2534,75 @@
   return $ret;
 }
 
+function update_144() {
+  $ret = array();
+
+  // Clear the node indexing position and the word-length settings.
+  $ret[] = update_sql("DELETE FROM {variable} WHERE name = 'node_cron_last'");
+  $ret[] = update_sql("DELETE FROM {variable} WHERE name = 'minimum_word_size'");
+  $ret[] = update_sql("DELETE FROM {variable} WHERE name = 'remove_short'");
+
+  // The index tables are dropped and recreated empty with the new layout.
+  $ret[] = update_sql('DROP TABLE {search_index}');
+  $ret[] = update_sql('DROP TABLE {search_total}');
+
+  if ($GLOBALS['db_type'] == 'mysql') {
+    $ret[] = update_sql("CREATE TABLE {search_dataset} (
+      sid int(10) unsigned NOT NULL default '0',
+      type varchar(16) default NULL,
+      data longtext NOT NULL,
+      KEY sid_type (sid, type)
+    )");
+
+    $ret[] = update_sql("CREATE TABLE {search_index} (
+      word varchar(50) NOT NULL default '',
+      sid int(10) unsigned NOT NULL default '0',
+      type varchar(16) default NULL,
+      fromsid int(10) unsigned NOT NULL default '0',
+      fromtype varchar(16) default NULL,
+      score int(10) unsigned default NULL,
+      KEY sid_type (sid, type),
+      KEY from_sid_type (fromsid, fromtype),
+      KEY word (word)
+    )");
+
+    $ret[] = update_sql("CREATE TABLE {search_total} (
+      word varchar(50) NOT NULL default '',
+      count float default NULL,
+      PRIMARY KEY word (word)
+    )");
+  }
+  elseif ($GLOBALS['db_type'] == 'pgsql') {
+    $ret[] = update_sql("CREATE TABLE {search_dataset} (
+      sid integer NOT NULL default '0',
+      type varchar(16) default NULL,
+      data text NOT NULL default ''
+    )");
+    // PostgreSQL has no inline KEY clause; the index is created separately.
+    $ret[] = update_sql("CREATE INDEX search_dataset_sid_type_idx ON {search_dataset}(sid, type)");
+
+    $ret[] = update_sql("CREATE TABLE {search_index} (
+      word varchar(50) NOT NULL default '',
+      sid integer NOT NULL default '0',
+      type varchar(16) default NULL,
+      fromsid integer NOT NULL default '0',
+      fromtype varchar(16) default NULL,
+      score integer default NULL
+    )");
+    $ret[] = update_sql("CREATE INDEX search_index_sid_type_idx ON {search_index}(sid, type)");
+    $ret[] = update_sql("CREATE INDEX search_index_from_sid_type_idx ON {search_index}(fromsid, fromtype)");
+    $ret[] = update_sql("CREATE INDEX search_index_word_idx ON {search_index}(word)");
+
+    $ret[] = update_sql("CREATE TABLE {search_total} (
+      word varchar(50) NOT NULL default '',
+      count float default NULL
+    )");
+    $ret[] = update_sql("CREATE INDEX search_total_word_idx ON {search_total}(word)");
+  }
+
+  return $ret;
+}
+
 function update_sql($sql) {
   $edit = $_POST["edit"];
   $result = db_query($sql);
cvs diff: Diffing includes
Index: includes/database.mysql.inc
===================================================================
RCS file: /cvs/drupal/drupal/includes/database.mysql.inc,v
retrieving revision 1.33
diff -u -r1.33 database.mysql.inc
--- includes/database.mysql.inc 30 Jul 2005 18:01:51 -0000 1.33
+++ includes/database.mysql.inc 4 Aug 2005 01:11:01 -0000
@@ -238,6 +238,45 @@
 }
 
 /**
+ * Runs a SELECT query and stores its results in a temporary table.
+ *
+ * Use this as a substitute for db_query() when the results need to be stored
+ * in a temporary table. Temporary tables exist for the duration of the page
+ * request.
+ * User-supplied arguments to the query should be passed in as separate parameters
+ * so that they can be properly escaped to avoid SQL injection attacks.
+ *
+ * @param $query
+ *   A string containing a normal SELECT SQL query.
+ * @param ...
+ *   A variable number of arguments which are substituted into the query using
+ *   printf() syntax. 
Instead of a variable number of query arguments, you may
+ *   also pass a single array containing the query arguments.
+ * @param $table
+ *   The name of the temporary table to select into. This name will not be
+ *   prefixed as there is no risk of collision.
+ * @return
+ *   A database query result resource, or FALSE if the query was not executed
+ *   correctly.
+ */
+function db_query_temporary($query) {
+  $args = func_get_args();
+  $tablename = array_pop($args);
+
+  $query = preg_replace('/^SELECT/i', 'CREATE TEMPORARY TABLE '. $tablename .' SELECT', db_prefix_tables($query));
+  if (count($args) > 1) {
+    // Check for array (alternative syntax).
+    if (is_array($args[1])) {
+      $args = array_merge(array($query), $args[1]);
+    }
+    $args = array_map('db_escape_string', $args);
+    $args[0] = $query;
+    $query = call_user_func_array('sprintf', $args);
+  }
+  return _db_query($query);
+}
+
+/**
  * Returns a properly formatted Binary Large OBject value.
  *
  * @param $data
Index: includes/database.pgsql.inc
===================================================================
RCS file: /cvs/drupal/drupal/includes/database.pgsql.inc,v
retrieving revision 1.11
diff -u -r1.11 database.pgsql.inc
--- includes/database.pgsql.inc 30 Jul 2005 18:01:51 -0000 1.11
+++ includes/database.pgsql.inc 4 Aug 2005 01:11:01 -0000
@@ -223,6 +223,45 @@
 }
 
 /**
+ * Runs a SELECT query and stores its results in a temporary table.
+ *
+ * Use this as a substitute for db_query() when the results need to be stored
+ * in a temporary table. Temporary tables exist for the duration of the page
+ * request.
+ * User-supplied arguments to the query should be passed in as separate parameters
+ * so that they can be properly escaped to avoid SQL injection attacks.
+ *
+ * @param $query
+ *   A string containing a normal SELECT SQL query.
+ * @param ...
+ *   A variable number of arguments which are substituted into the query using
+ *   printf() syntax. Instead of a variable number of query arguments, you may
+ *   also pass a single array containing the query arguments.
+ * @param $table
+ *   The name of the temporary table to select into. This name will not be
+ *   prefixed as there is no risk of collision.
+ * @return
+ *   A database query result resource, or FALSE if the query was not executed
+ *   correctly.
+ */
+function db_query_temporary($query) {
+  $args = func_get_args();
+  $tablename = array_pop($args);
+
+  $query = preg_replace('/^SELECT/i', 'CREATE TEMPORARY TABLE '. $tablename .' AS SELECT', db_prefix_tables($query));
+  if (count($args) > 1) {
+    // Check for array (alternative syntax).
+    if (is_array($args[1])) {
+      $args = array_merge(array($query), $args[1]);
+    }
+    $args = array_map('db_escape_string', $args);
+    $args[0] = $query;
+    $query = call_user_func_array('sprintf', $args);
+  }
+  return _db_query($query);
+}
+
+/**
  * Returns a properly formatted Binary Large OBject value.
  *
  * @param $data
cvs diff: Diffing misc
Index: misc/drupal.css
===================================================================
RCS file: /cvs/drupal/drupal/misc/drupal.css,v
retrieving revision 1.111
diff -u -r1.111 drupal.css
--- misc/drupal.css 29 Jul 2005 06:59:29 -0000 1.111
+++ misc/drupal.css 4 Aug 2005 01:11:02 -0000
@@ -459,6 +459,13 @@
 .search-results .search-info {
   font-size: 0.85em;
 }
+.search-advanced .criterium {
+  float: left;
+  margin-right: 2em;
+}
+.search-advanced .action {
+  clear: left;
+}
 #tracker td.replies {
   text-align: center;
 }
cvs diff: Diffing modules
Index: modules/node.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/node.module,v
retrieving revision 1.514
diff -u -r1.514 node.module
--- modules/node.module 1 Aug 2005 05:14:05 -0000 1.514
+++ modules/node.module 4 Aug 2005 01:11:08 -0000
@@ -548,17 +548,48 @@
   switch ($op) {
     case 'name':
       return t('content');
+
     case 'reset':
       variable_del('node_cron_last');
       return;
+
     case 'status':
       $last = 
variable_get('node_cron_last', 0); $total = db_result(db_query('SELECT COUNT(*) FROM {node} WHERE status = 1 AND moderate = 0')); $remaining = db_result(db_query('SELECT COUNT(*) FROM {node} n LEFT JOIN {node_comment_statistics} c ON n.nid = c.nid WHERE n.status = 1 AND n.moderate = 0 AND (n.created > %d OR n.changed > %d OR c.last_comment_timestamp > %d)', $last, $last, $last)); return array('remaining' => $remaining, 'total' => $total); + case 'search': + // Build conditions list($join, $where) = _db_rewrite_sql(); - $find = do_search($keys, 'node', 'INNER JOIN {node} n ON n.nid = i.sid '. $join .' INNER JOIN {users} u ON n.uid = u.uid', 'n.status = 1'. (empty($where) ? '' : ' AND '. $where)); + $arguments = array(); + $conditions = 'n.status = 1'; + + if ($type = search_query_extract($keys, 'type')) { + $types = array(); + foreach (explode(',', $type) as $t) { + $types[] = "n.type = '%s'"; + $arguments[] = $t; + } + $conditions .= ' AND ('. implode(' OR ', $types) .')'; + $keys = search_query_insert($keys, 'type'); + } + + if ($category = search_query_extract($keys, 'category')) { + $categories = array(); + foreach (explode(',', $category) as $c) { + $categories[] = "tn.tid = %d"; + $arguments[] = $c; + } + $conditions .= ' AND ('. implode(' OR ', $categories) .')'; + $keys = search_query_insert($keys, 'category'); + $join .= ' INNER JOIN {term_node} tn ON n.nid = tn.nid'; + } + + // Do search + $find = do_search($keys, 'node', 'INNER JOIN {node} n ON n.nid = i.sid '. $join .' INNER JOIN {users} u ON n.uid = u.uid', $conditions . (empty($where) ? '' : ' AND '. $where), $arguments); + + // Load results $results = array(); foreach ($find as $item) { $node = node_load($item); @@ -579,10 +610,58 @@ 'title' => $node->title, 'user' => theme('username', $node), 'date' => $node->changed, + 'node' => $node, 'extra' => $extra, 'snippet' => search_excerpt($keys, $node->body)); } return $results; + + case 'form': + // Keyword boxes + $group = '
'. form_textfield(t('Containing any of the words'), 'or', '', 30, 255); + $group .= form_textfield(t('Containing the phrase'), 'phrase', '', 30, 255); + $group .= form_textfield(t('Containing none of the words'), 'negative', '', 30, 255) .'
'; + + // Taxonomy box + if ($taxonomy = module_invoke('taxonomy', 'form_all')) { + // explode(',', search_query_extract($keys, 'category')) + $group .= '
'. form_select('Only in the category', 'category', array(), $taxonomy, 0, 'size="7"', TRUE) .'
'; + } + + // Node types + $types = array(); + foreach (node_list() as $type => $module) { + $types[$type] = node_invoke($type, 'node_name'); + } + // explode(',', search_query_extract($keys, 'type')) + $group .= '
'. form_checkboxes(t('Only of the type'), 'type', array(), $types) .'
'; + + $group .= '
'. form_submit(t('Advanced Search')) .'
'; + return form_group_collapsible(t('Advanced search'), $group, TRUE, NULL, array('class' => 'search-advanced')); + + case 'post': + // Insert extra restrictions into the search keywords string. + $edit = &$_POST['edit']; + if (is_array($edit['type'])) { + $keys = search_query_insert($keys, 'type', implode(',', $edit['type'])); + } + if (is_array($edit['category'])) { + $keys = search_query_insert($keys, 'category', implode(',', $edit['category'])); + } + if ($edit['or'] != '') { + if (preg_match_all('/ ("[^"]+"|[^" ]+)/i', ' '. $edit['or'], $matches)) { + $keys = $keys .' '. implode(' OR ', $matches[1]); + } + } + if ($edit['negative'] != '') { + if (preg_match_all('/ ("[^"]+"|[^" ]+)/i', ' '. $edit['negative'], $matches)) { + $keys = $keys .' -'. implode(' -', $matches[1]); + } + } + if ($edit['phrase'] != '') { + $keys .= ' "'. str_replace('"', ' ', $edit['phrase']) .'"'; + } + return trim($keys); } } Index: modules/search.module =================================================================== RCS file: /cvs/drupal/drupal/modules/search.module,v retrieving revision 1.132 diff -u -r1.132 search.module --- modules/search.module 29 Jul 2005 08:18:20 -0000 1.132 +++ modules/search.module 4 Aug 2005 01:11:11 -0000 @@ -43,6 +43,13 @@ define('PREG_CLASS_PUNCTUATION', 
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{10100}\x{10101}\x{1039f}'); /** + * Matches all CJK characters that are candidates for auto-splitting + * (Chinese, Japanese, Korean). + * Contains kana and BMP ideographs. + */ +define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}'); + +/** * Implementation of hook_help(). */ function search_help($section = 'admin/help#search') { @@ -56,9 +63,9 @@ case 'search#noresults': return t('

', array('%number' => variable_get('minimum_word_size', 3))); +
  • Remove quotes around phrases to match each word individually: "blue smurf" will match less than blue smurf.
  • +
  • Consider loosening your query with OR: Drupal CMS will match less than Drupal OR CMS.
  • +

    '); } } @@ -127,7 +134,7 @@ function search_admin() { if ($_POST) { // If the word length settings change, the index needs to be rebuilt. - if (variable_get('minimum_word_size', 3) != $_POST['edit']['minimum_word_size']) { + if (variable_get('minimum_word_size', 5) != $_POST['edit']['minimum_word_size']) { drupal_set_message(t('The index will be rebuilt.')); search_wipe(); system_settings_save(); @@ -158,8 +165,8 @@ $output .= form_group(t('Indexing throttle'), $group); // Indexing settings: $group = ''. t('

    Changing the setting below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.

    The default settings should be appropriate for the majority of sites.

    ') .'
    ';
-  $group .= form_textfield(t('Minimum word length to index'), 'minimum_word_size', variable_get('minimum_word_size', 3), 5, 3, t('The number of characters a word has to be to be indexed. Words shorter than this will not be searchable.'));
-  $group .= form_textfield(t('Minimum word length to search for'), 'remove_short', variable_get('remove_short', 3), 5, 3, t('The number of characters a word has to be to be searched for, including wildcard characters.'));
+  $group .= form_textfield(t('Minimum word length to index'), 'minimum_word_size', variable_get('minimum_word_size', 5), 5, 3, t('The number of characters a word has to be to be indexed. Words shorter than this will not be searchable.'));
+  $group .= form_textfield(t('Minimum word/phrase length to search for'), 'remove_short', variable_get('remove_short', 3), 5, 3, t('The number of characters a word or phrase has to be to be searched for.'));
   $output .= form_group(t('Indexing settings'), $group);
 
   return system_settings_form($output);
@@ -179,6 +186,7 @@
     module_invoke_all('search', 'reset');
   }
   else {
+    db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
     db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
     db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
   }
@@ -212,7 +220,11 @@
   }
   // Update word counts for new/changed words
   foreach (search_dirty() as $word => $dummy) {
+    // Get total count
     $total = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
+    // Apply Zipf's law
+    $total = log10(1 + 1/(max(1, $total)));
+    // Store IDF. The weight is fractional, so it is written with %f (not %d).
-    db_query("UPDATE {search_total} SET count = %d WHERE word = '%s'", $total, $word);
+    db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $total, $word);
     if (!db_affected_rows()) {
-      db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %d)", $word, $total);
+      db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $word, $total);
@@ -228,22 +240,18 @@
 }
 
 /**
- * Splits a string into component words according to indexing rules.
+ * Simplifies a string according to indexing rules.
*/ -function search_keywords_split($text) { - static $last = null; - static $lastsplit = null; - - if ($last == $text) { - return $lastsplit; - } - +function search_simplify($text) { // Decode entities to UTF-8 $text = decode_entities($text); // Call an external processor for word handling. search_preprocess($text); + // Baseline CJK handling + $text = preg_replace_callback('/['. PREG_CLASS_CJK .']+/u', 'search_expand_cjk', $text); + // To improve searching for numerical data such as dates, IP addresses // or version numbers, we consider a group of numerical characters // separated only by punctuation characters to be one piece. @@ -260,9 +268,45 @@ // marks, spacers, etc, to be a word boundary. $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text); + return $text; +} + +/** + * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping + * pairs of characters. + */ +function search_expand_cjk($matches) { + $tokens = ' '; + // Split off first character + $last = drupal_substr($matches[0], 0, 1); + $str = substr($matches[0], strlen($last)); + // Begin loop + $l = drupal_strlen($str); + for ($i = 0; $i < $l; ++$i) { + // Grab next character + $current = drupal_substr($str, 0, 1); + $str = substr($str, strlen($last)); + $tokens .= $last . $current .' '; + $last = $current; + } + return $tokens; +} + +/** + * Splits a string into tokens for indexing. + */ +function search_index_split($text) { + static $last = null; + static $lastsplit = null; + + if ($last == $text) { + return $lastsplit; + } + // Process words + $text = search_simplify($text); $words = explode(' ', $text); - array_walk($words, '_search_keywords_truncate'); + array_walk($words, '_search_index_truncate'); // Save last keyword result $last = $text; @@ -272,28 +316,13 @@ } /** - * Helper function for array_walk in search_keywords_split. + * Helper function for array_walk in search_index_split. 
*/ -function _search_keywords_truncate(&$text) { +function _search_index_truncate(&$text) { $text = truncate_utf8($text, 50); } /** - * Loosens up a set of search keywords by adding wildcards, if possible. - * - * @param $text - * The keywords as entered by the user. - * @return - * If more wildcards can be added, the adjusted keywords are returned. - * If the query is already as loose as possible, NULL is returned. - */ -function search_keywords_variation($text) { - $text = trim($text); - $new = preg_replace('/\*+/', '*', '*'. implode('* *', explode(' ', trim($text))) .'*'); - return ($new != $text) ? $new : NULL; -} - -/** * Invokes hook_search_preprocess() in modules. */ function search_preprocess(&$text) { @@ -302,7 +331,6 @@ } } - /** * Update the full-text search index for a particular item. * @@ -318,8 +346,9 @@ * @ingroup search */ function search_index($sid, $type, $text) { - $minimum_word_size = variable_get('minimum_word_size', 3); + $minimum_word_size = variable_get('minimum_word_size', 5); + // Link matching global $base_url; $node_regexp = '!href=[\'"]?(?:'. preg_quote($base_url) .'/)?(?:\?q=)?([^\'">]+)[\'">]!i'; @@ -350,6 +379,7 @@ $tag = false; // Odd/even counter. Tag or no tag. $link = false; // State variable for link analyser $score = 1; // Starting score per word + $accum = ' '; // Accumulator for cleaned up data $results = array(0 => array()); @@ -375,7 +405,11 @@ if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) { $linknid = $match[1]; if ($linknid > 0) { - $link = true; + // Note: ignore links to uncachable nodes to avoid redirect bugs. 
+                $format = db_result(db_query('SELECT format FROM {node} WHERE nid = %d', $linknid));
+                if (filter_format_allowcache($format)) {
+                  $link = true;
+                }
               }
             }
           }
@@ -386,11 +420,13 @@
     else {
       // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
       if ($value != '') {
-        $words = search_keywords_split($value);
+        $words = search_index_split($value);
         foreach ($words as $word) {
+          $word = drupal_strtolower($word);
+          // Add word to accumulator
+          $accum .= $word .' ';
           // Check wordlength
           if (drupal_strlen($word) >= $minimum_word_size) {
-            $word = drupal_strtolower($word);
             if ($link) {
               if (!isset($results[$linknid])) {
                 $results[$linknid] = array();
@@ -409,6 +445,9 @@
 
   search_wipe($sid, $type);
 
+  // Insert cleaned up data into dataset
+  db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);
+
   // Insert results into search index
   foreach ($results[0] as $word => $score) {
     db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %d)", $word, $sid, $type, $score);
@@ -426,6 +465,147 @@
 }
 
 /**
+ * Extract a module-specific search option from a search query. e.g. 'type:book'
+ */
+function search_query_extract($keys, $option) {
+  if (preg_match('/(^| )'. preg_quote($option, '/') .':([^ ]*)( |$)/i', $keys, $matches)) {
+    return $matches[2];
+  }
+}
+
+/**
+ * Return a query with the given module-specific search option inserted in.
+ * e.g. 'type:book'.
+ */
+function search_query_insert($keys, $option, $value = '') {
+  if (search_query_extract($keys, $option)) {
+    $keys = trim(preg_replace('/(^| )'. preg_quote($option, '/') .':[^ ]*/i', '', $keys));
+  }
+  if ($value != '') {
+    $keys .= ' '. $option .':'. $value;
+  }
+  return $keys;
+}
+
+/**
+ * Parse a search query into SQL conditions.
+ *
+ * We build a query that matches the dataset bodies
+ */
+function search_parse_query($text) {
+  $keys = array('positive' => array(), 'negative' => array());
+
+  // Tokenize query string
+  preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);
+
+  if (count($matches) < 1) {
+    return NULL;
+  }
+
+  // Classify tokens
+  $or = false;
+  foreach ($matches as $match) {
+    // Strip off quotes
+    if ($match[2]{0} == '"') {
+      $match[2] = substr($match[2], 1, -1);
+    }
+    // Simplify keyword according to indexing rules
+    $match[2] = search_simplify($match[2]);
+    // Negative matches
+    if ($match[1] == '-') {
+      $keys['negative'][] = $match[2];
+    }
+    // OR operator: instead of a single keyword, we store an array of all OR'd
+    // keywords. The cast keeps an existing group flat for "a OR b OR c".
+    elseif ($match[2] == 'OR' && count($keys['positive'])) {
+      $keys['positive'][] = (array) array_pop($keys['positive']);
+      $or = true;
+      continue;
+    }
+    // Plain keyword
+    else {
+      if ($or) {
+        $keys['positive'][count($keys['positive']) - 1][] = $match[2];
+      }
+      else {
+        $keys['positive'][] = $match[2];
+      }
+    }
+    $or = false;
+  }
+
+  // Convert keywords into SQL statements.
+  $scorewords = array();
+  $query = array();
+  $query2 = array();
+  $arguments = array();
+  $arguments2 = array();
+  foreach ($keys['positive'] as $key) {
+    if (is_array($key) && count($key)) {
+      $queryor = array();
+      foreach ($key as $or) {
+        $q = _search_parse_query($or, $scorewords);
+        if ($q) {
+          $queryor[] = $q;
+          $arguments[] = $or;
+        }
+      }
+      if (count($queryor)) {
+        $query[] = '('. implode(' OR ', $queryor) .')';
+      }
+    }
+    else {
+      $q = _search_parse_query($key, $scorewords);
+      if ($q) {
+        $query[] = $q;
+        $arguments[] = $key;
+      }
+    }
+  }
+  foreach ($keys['negative'] as $key) {
+    $q = _search_parse_query($key, $scorewords, true);
+    if ($q) {
+      $query[] = $q;
+      $arguments[] = $key;
+    }
+  }
+  // We separate word-index conditions because they are not needed in the
+  // counting query.
+  if (count($scorewords) == 0) {
+    $query2[] = "i.word = ''";
+  }
+  else foreach ($scorewords as $word) {
+    $query2[] = "i.word = '%s'";
+    $arguments2[] = $word;
+  }
+  $query = implode(' AND ', $query);
+  $query2 = implode(' OR ', $query2);
+  return array($query, $arguments, $query2, $arguments2);
+}
+
+/**
+ * Helper function for search_parse_query();
+ */
+function _search_parse_query(&$word, &$scores, $not = false) {
+  // Check word/phrase
+  if (drupal_strlen($word) < variable_get('remove_short', 3)) {
+    return '';
+  }
+  $word = drupal_strtolower($word);
+  // Determine the scorewords of this word/phrase
+  if (!$not) {
+    $split = search_index_split($word);
+    foreach ($split as $s) {
+      if (drupal_strlen($s) >= variable_get('minimum_word_size', 5)) {
+        $scores[] = $s;
+      }
+    }
+  }
+  // Return matching snippet
+  return "d.data ". ($not ? 'NOT ' : '') ."LIKE '%% %s %%'";
+}
+
+/**
  * Do a query on the full-text search index for a word or words.
  *
  * This function is normally only called by each module that support the
@@ -434,12 +614,11 @@
  * The final query is an SQL select on the search_index table. As a guide for
  * writing the optional extra SQL fragments (see below), use this query:
  *
- *   SELECT i.type, i.sid, i.word, SUM(i.score/t.count) AS score
- *   FROM {search_index} i
- *   $join INNER JOIN {search_total} t ON i.word = t.word
- *   WHERE $where AND (i.word = '...' OR ...)
- *   GROUP BY i.type, i.sid
- *   ORDER BY score DESC";
+ *   SELECT i.type, i.sid, SUM(i.score*t.count) AS score FROM {search_index} i
+ *     INNER JOIN {search_total} t ON i.word = t.word
+ *     $join
+ *     WHERE $conditions
+ *   GROUP BY i.type, i.sid ORDER BY score DESC
  *
  * @param $keywords
  *   A search string as entered by the user.
@@ -449,83 +628,50 @@
  *
  * @param $join
  *   (optional) A string to be inserted into the JOIN part of the SQL query.
  *   For example "INNER JOIN {node} n ON n.nid = i.sid".
* * @param $where * (optional) A string to be inserted into the WHERE part of the SQL query. - * For example "(n.status > 0)". + * For example "(n.status > %d)". * - * @param $variation - * Used internally. Must not be specified. + * @param $arguments + * (optional) SQL arguments belonging to the $where parameter. * * @return * An array of SIDs for the search results. * * @ingroup search */ -function do_search($keywords, $type, $join = '', $where = '1', $variation = true) { - // Note, we replace the wildcards with U+FFFD (Replacement character) to pass - // through the keyword extractor. Multiple wildcards are collapsed into one. - $keys = preg_replace('!\*+!', '�', $keywords); +function do_search($keywords, $type, $join = '', $where = '1', $arguments = array()) { // Split into words - $keys = search_keywords_split($keys); - - $words = array(); - $arguments = array(); - $refused = array(); - // Build WHERE clause - foreach ($keys as $word) { - if (drupal_strlen($word) < variable_get('remove_short', 3)) { - if ($word != '') { - $refused[] = str_replace('�', '*', $word); - } - continue; - } - if (strpos($word, '�') !== false) { - $words[] = "i.word LIKE '%s'"; - $arguments[] = str_replace('�', '%', drupal_strtolower($word)); - } - else { - $words[] = "i.word = '%s'"; - $arguments[] = drupal_strtolower($word); - } - } - // Tell the user which words were excluded - if (count($refused) && $variation) { - $message = format_plural(count($refused), - 'The word %words was not included because it is too short.', - 'The words %words were not included because they were too short.'); - drupal_set_message(strtr($message, array('%words' => theme('placeholder', implode(', ', $refused))))); - } + $query = search_parse_query($keywords); - if (count($words) == 0) { + if ($query === NULL || $query[0] == '' || $query[2] == '') { return array(); } - $conditions = $where .' AND ('. 
implode(' OR ', $words) .')'; - // Get result count (for pager) - $count = db_num_rows(db_query("SELECT DISTINCT i.sid, i.type FROM {search_index} i $join WHERE $conditions", $arguments)); - if ($count == 0) { - // Try out a looser search query if nothing was found. - if ($variation && $loose = search_keywords_variation($keywords)) { - return do_search($loose, $type, $join, $where, false); - } - else { - return array(); - } + // First pass: select all possible matching sids, doing a simple index-based OR matching on the keywords. + // Note: only search_index restrictions. + $conditions = $where .' AND ('. $query[2] .')'; + $arguments = array_merge($arguments, $query[3]); + $result = db_query_temporary("SELECT i.type, i.sid, SUM(i.score*t.count) AS score FROM {search_index} i INNER JOIN {search_total} t ON i.word = t.word $join WHERE $conditions GROUP BY i.type, i.sid ORDER BY score DESC", $arguments, 'temp_search_sids'); + + // Second pass: only keep items that match the complicated keywords conditions (phrase search, negative keywords, ...) + $conditions = '('. 
$query[0] .')'; + $arguments = $query[1]; + $result = db_query_temporary("SELECT i.type, i.sid, i.score FROM temp_search_sids i INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type WHERE $conditions", $arguments, 'temp_search_results'); + if (($count = db_result(db_query('SELECT COUNT(*) FROM temp_search_results'))) == 0) { + return array(); } $count_query = "SELECT $count"; - // Do pager query - $query = "SELECT i.type, i.sid, SUM(i.score/t.count) AS score FROM {search_index} i $join INNER JOIN {search_total} t ON i.word = t.word WHERE $conditions GROUP BY i.type, i.sid ORDER BY score DESC"; - $result = pager_query($query, 15, 0, $count_query, $arguments); - + // Do actual search query + $result = pager_query("SELECT sid, type, score FROM temp_search_results", 10, 0, $count_query, $arguments); $results = array(); while ($item = db_fetch_object($result)) { $results[] = $item->sid; } - return $results; } @@ -548,11 +694,12 @@ // Search form submits with POST but redirects to GET. This way we can keep // the search query URL clean as a whistle: // search/type/keyword+keyword - if ($_POST['edit']['keys']) { + if (isset($_POST['op'])) { if ($type == '') { $type = 'node'; } - drupal_goto('search/'. urlencode($type) .'/'. urlencode($_POST['edit']['keys'])); + $keys = module_invoke($type, 'search', 'post', $_POST['edit']['keys']); + drupal_goto('search/'. urlencode($type) .'/'. urlencode(is_null($keys) ? $_POST['edit']['keys'] : $keys)); } else if ($type == '') { // Note: search/node can not be a default tab because it would take on the @@ -658,6 +805,7 @@ $box .= form_submit(t('Search')); $box .= ''; $output .= form_item($prompt, $box); + $output .= module_invoke($type, 'search', 'form', $keys); $output .= ''; return form($output, 'post', $action); @@ -691,8 +839,7 @@ * Used for formatting search results. * * @param $keys - * A string containing keywords. They are split into words using the same - * rules as search indexing. 
+ * A string containing a search query. * * @param $text * The text to extract fragments from. @@ -701,7 +848,11 @@ * A string containing HTML for the excerpt. */ function search_excerpt($keys, $text) { - $keys = search_keywords_split($keys); + // Extract positive keywords and phrases + preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/i', ' '. $keys, $matches); + $keys = array_merge($matches[2], $matches[3]); + + // Prepare text $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text)); array_walk($keys, '_search_excerpt_replace'); $workkeys = $keys;