Index: modules/search/search.admin.inc
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.admin.inc,v
retrieving revision 1.8
diff -u -p -r1.8 search.admin.inc
--- modules/search/search.admin.inc	11 Jan 2009 21:19:18 -0000	1.8
+++ modules/search/search.admin.inc	11 Jun 2009 05:41:04 -0000
@@ -86,6 +86,20 @@ function search_admin_settings() {
     '#default_value' => TRUE,
     '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Turn this off if you want to use an external preprocessor for this instead. Does not affect other languages.')
   );
+
+  // Build the select options: 0 disables N-Grams, 1 to 6 is the gram size.
+  $n_gram_options = array(0 => t('Full Word Matches'));
+  foreach (range(1, 6) as $size) {
+    $n_gram_options[$size] = t('@num Character size', array('@num' => $size));
+  }
+
+  $form['indexing_settings']['search_gram_size'] = array(
+    '#type' => 'select',
+    '#title' => t('Character Matches'),
+    '#description' => t('This allows for partial word matching in searches. It breaks a word down into several N-sized character strings. Set it to Full Word Matches if you do not want this feature. The smaller the number of characters the more accurate the results but at a larger database cost.'),
+    '#options' => $n_gram_options,
+    '#default_value' => variable_get('search_gram_size', 0),
+  );
 
   $form['#validate'] = array('search_admin_settings_validate');
 
@@ -101,9 +115,14 @@ function search_admin_settings_validate(
   if ($form_state['values']['op'] == t('Re-index site')) {
     drupal_goto('admin/settings/search/wipe');
   }
+  // Only allow gram sizes that are smaller than or equal to the minimum word size.
+  if ($form_state['values']['minimum_word_size'] < $form_state['values']['search_gram_size']) {
+    form_set_error('search_gram_size', t('N-Gram size must be less than or equal to the minimum word size'));
+  }
   // If these settings change, the index needs to be rebuilt.
-  if ((variable_get('minimum_word_size', 3) != $form_state['values']['minimum_word_size']) ||
-      (variable_get('overlap_cjk', TRUE) != $form_state['values']['overlap_cjk'])) {
+  elseif ((variable_get('minimum_word_size', 3) != $form_state['values']['minimum_word_size']) ||
+      (variable_get('overlap_cjk', TRUE) != $form_state['values']['overlap_cjk']) ||
+      (variable_get('search_gram_size', 0) != $form_state['values']['search_gram_size'])) {
     drupal_set_message(t('The index will be rebuilt.'));
     search_wipe();
   }
Index: modules/search/search.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.module,v
retrieving revision 1.298
diff -u -p -r1.298 search.module
--- modules/search/search.module	3 Jun 2009 06:52:29 -0000	1.298
+++ modules/search/search.module	11 Jun 2009 05:41:05 -0000
@@ -321,6 +321,11 @@ function search_simplify($text) {
 
   // Call an external processor for word handling.
   search_invoke_preprocess($text);
+
+  // Apply N-Gramming when a gram size other than 0 is configured.
+  if ($size = variable_get('search_gram_size', 0)) {
+    $text = search_gram_words($text, $size);
+  }
 
   // Simple CJK handling
   if (variable_get('overlap_cjk', TRUE)) {
@@ -347,6 +352,47 @@ function search_simplify($text) {
 }
 
 /**
+ * Provides N-Gramming functionality.
+ *
+ * Splits the text on whitespace and breaks every word down into overlapping
+ * sequences of $gram_size characters, so that no gram spans two words. Words
+ * that are not longer than $gram_size are kept whole.
+ *
+ * @param string $text
+ *   Single piece of plain-text that was extracted from between two HTML tags.
+ *   Will not contain any HTML entities.
+ * @param int $gram_size
+ *   The size of the N-Gram, in characters.
+ *
+ * @return string
+ *   The N-Grams of all words, delimited by single spaces.
+ */
+function search_gram_words($text, $gram_size) {
+  $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
+  if ($words === FALSE) {
+    // preg_split() returns FALSE on failure (for example on malformed UTF-8);
+    // leave such text untouched.
+    return $text;
+  }
+
+  $grams = array();
+  foreach ($words as $word) {
+    // Measure the word once rather than on every loop iteration.
+    $length = drupal_strlen($word);
+    if ($length <= $gram_size) {
+      $grams[] = $word;
+      continue;
+    }
+    // Step through the word one character at a time and collect each
+    // $gram_size long slice.
+    for ($start = 0; $start + $gram_size <= $length; $start++) {
+      $grams[] = drupal_substr($word, $start, $gram_size);
+    }
+  }
+  return implode(' ', $grams);
+}
+
+/**
  * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
  * sequences of characters ('minimum_word_size' long).
  */
@@ -1246,7 +1292,7 @@ function search_excerpt($keys, $text) {
       }
       // Locate a keyword (position $p), then locate a space in front (position
       // $q) and behind it (position $s)
-      if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
+      if (preg_match('/' . $boundary . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $key . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
         $p = $match[0][1];
         if (($q = strpos($text, ' ', max(0, $p - 60))) !== FALSE) {
           $end = substr($text, $p, 80);
@@ -1304,7 +1350,7 @@ function search_excerpt($keys, $text) {
   $text = (isset($newranges[0]) ? '' : '... ') . implode(' ... ', $out) . ' ...';
 
   // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
-  $text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', $text);
+  $text = preg_replace('/' . $boundary . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . '(' . implode('|', $keys) . ')' . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $boundary . '/iu', '<strong>\0</strong>', $text);
   return $text;
 }
 