diff --git search_api_xapian/search_api_xapian.info search_api_xapian/search_api_xapian.info
new file mode 100644
index 0000000..a34cd67
--- /dev/null
+++ search_api_xapian/search_api_xapian.info
@@ -0,0 +1,11 @@
+; $Id$
+
+name = Xapian integration for Search API
+description = Offers an implementation of the Search API that uses an Xapian for indexing content.
+core = 7.x
+package = Search
+dependencies[] = search_api
+
+files[] = search_api_xapian.install
+files[] = search_api_xapian.module
+files[] = service.inc
diff --git search_api_xapian/search_api_xapian.install search_api_xapian/search_api_xapian.install
new file mode 100644
index 0000000..4d375f9
--- /dev/null
+++ search_api_xapian/search_api_xapian.install
@@ -0,0 +1,84 @@
+ $version, '%minimum' => SEARCH_API_XAPIAN_MINIMUM_BINDINGS));
+    }
+    else {
+      $severity = REQUIREMENT_OK;
+    }
+  }
+  else {
+    $severity = REQUIREMENT_ERROR;
+    $version = $t('You need to install Xapian bindings version >= %minimum. %error', array('%minimum' => SEARCH_API_XAPIAN_MINIMUM_BINDINGS, '%error' => $GLOBALS['search_api_xapian_include_error']));
+  }
+
+  return array(
+    'xapian' => array(
+      'title' => $t('Xapian bindings'),
+      'value' => $version,
+      'severity' => $severity,
+    ),
+  );
+}
+
+/**
+ * Error handler that records the message of the PHP error it receives.
+ *
+ * The message is stored in $GLOBALS['search_api_xapian_include_error'], which
+ * is also what the requirements report above prints as %error. All other
+ * arguments are accepted only to match the callback signature expected by
+ * set_error_handler() and are ignored. xapian_available() installs this
+ * handler while it includes xapian.php.
+ *
+ * @param int $errno
+ *   Level of the error raised (unused).
+ * @param string $errstr
+ *   The error message; this is the only value that is kept.
+ * @param string $errfile
+ *   File in which the error was raised (unused).
+ * @param int $errline
+ *   Line at which the error was raised (unused).
+ * @param mixed $errcontext
+ *   Context passed by PHP to error handlers (unused).
+ */
+function search_api_xapian_requirements_error_handler($errno, $errstr, $errfile = NULL, $errline = NULL, $errcontext = NULL) {
+  $GLOBALS['search_api_xapian_include_error'] = $errstr;
+}
+
+/**
+ * Attempt to include xapian.php. If there are errors, Xapian is not
+ * available, otherwise it is available.
+ *
+ * @return bool
+ *   TRUE when including xapian.php raised no PHP error, FALSE otherwise.
+ */
+function xapian_available() {
+  // Only a positive result sticks in the static cache: while the cached value
+  // is still FALSE the include is attempted again on every call.
+  $available = &drupal_static(__FUNCTION__, FALSE);
+
+  if ($available === FALSE) {
+    // Route any PHP error raised by the include to the handler that records
+    // its message in $GLOBALS['search_api_xapian_include_error'].
+    $GLOBALS['search_api_xapian_include_error'] = NULL;
+    set_error_handler('search_api_xapian_requirements_error_handler');
+    include_once('xapian.php');
+    restore_error_handler();
+
+    // No recorded message means the include completed without errors.
+    if (NULL == $GLOBALS['search_api_xapian_include_error']) {
+      $available = TRUE;
+    }
+  }
+
+  return $available;
+}
+
+/**
+ * Get the version string of Xapian.
+ *
+ * Uses the procedural xapian_version_string() when that function exists and
+ * the static Xapian::version_string() otherwise.
+ *
+ * @return string
+ *   The version string reported by the Xapian bindings.
+ */
+function xapian_get_version() {
+  return (function_exists('xapian_version_string')) ? xapian_version_string() : Xapian::version_string();
+}
diff --git search_api_xapian/search_api_xapian.module search_api_xapian/search_api_xapian.module
new file mode 100644
index 0000000..ab3ce41
--- /dev/null
+++ search_api_xapian/search_api_xapian.module
@@ -0,0 +1,20 @@
+ t('Xapian local service'),
+    'description' => t('Service for Xapian search engine using a local Xapian database.'),
+    'class' => 'SearchApiXapianLocalService',
+  );
+  $services['search_api_xapian_service'] = array(
+    'name' => t('Xapian server service'),
+    'description' => t('Service for Xapian search engine using a remote Xapian database.'),
+    'class' => 'SearchApiXapianServerService',
+  );
+
+  return $services;
+}
diff --git search_api_xapian/service.inc search_api_xapian/service.inc
new file mode 100644
index 0000000..a9aadb6
--- /dev/null
+++ search_api_xapian/service.inc
@@ -0,0 +1,584 @@
+database)) {
+      return $this->database;
+    }
+    // Reuse the writable handle when one has already been opened.
+    if ($writable && is_object($this->writable_database)) {
+      return $this->writable_database;
+    }
+  }
+
+  /**
+   * Helper to get the right xapian stemmer object.
+   *
+   * @return XapianStem|NULL
+   *   The stemmer for the configured language ('english' when none is
+   *   configured), created on first use and then reused; NULL when it could
+   *   not be created.
+   */
+  public function getStemmer() {
+    if (!is_object($this->stemmer)) {
+      $language = isset($this->options['xapian_stem_language']) ? $this->options['xapian_stem_language'] : 'english';
+      try {
+        $this->stemmer = new XapianStem($language);
+      }
+      catch (Exception $e) {
+        // watchdog() translates and escapes at display time, so it gets the
+        // untranslated message plus its variables rather than t() output.
+        watchdog('search_api_xapian', 'An error occurred getting the stem with language %language: @msg.', array('%language' => $language, '@msg' => $e->getMessage()), WATCHDOG_ERROR);
+        return NULL;
+      }
+    }
+    return $this->stemmer;
+  }
+
+  /**
+   * Helper to get the right xapian term generator (aka indexer) object.
+   *
+   * @return XapianTermGenerator|NULL
+   *   The term generator with the stemmer from getStemmer() attached, or NULL
+   *   when either object could not be set up.
+   */
+  public function getIndexer() {
+    if (!is_object($this->indexer)) {
+      try {
+        $this->indexer = new XapianTermGenerator();
+      }
+      catch (Exception $e) {
+        watchdog('search_api_xapian', 'An error occurred getting the indexer: @msg.', array('@msg' => $e->getMessage()), WATCHDOG_ERROR);
+        return NULL;
+      }
+    }
+    // getStemmer() returns NULL (after logging) when the stemmer cannot be
+    // built; never hand that NULL to set_stemmer().
+    $stemmer = $this->getStemmer();
+    if (!is_object($stemmer)) {
+      return NULL;
+    }
+    try {
+      $this->indexer->set_stemmer($stemmer);
+    }
+    catch (Exception $e) {
+      watchdog('search_api_xapian', 'An error occurred setting the stemmer: @msg.', array('@msg' => $e->getMessage()), WATCHDOG_ERROR);
+      return NULL;
+    }
+    return $this->indexer;
+  }
+
+  /**
+   * Write all the pending changes to the xapian database.
+   *
+   * Does nothing when no writable database has been opened. Errors raised by
+   * the flush are logged and not propagated.
+   */
+  public function flushDatabase() {
+    if (is_object($this->writable_database)) {
+      try {
+        $this->writable_database->flush();
+      }
+      catch (Exception $e) {
+        watchdog('search_api_xapian', 'An error flushing the database: @msg.', array('@msg' => $e->getMessage()), WATCHDOG_ERROR);
+        return NULL;
+      }
+    }
+  }
+
+  /**
+   * Convenience method to get a unique item identifier inside the xapian
+   * database.
+   *
+   * @param int $index_id
+   *   The identifier of the index the item belongs to.
+   * @param int $item_id
+   *   The unique item identifier.
+   *
+   * @return string
+   *   self::DOCID_PREFIX followed by "index<index_id>item_id<item_id>"; both
+   *   identifiers are formatted as integers.
+   */
+  public function getUniqueId($index_id, $item_id) {
+    return sprintf('%sindex%ditem_id%d', self::DOCID_PREFIX, $index_id, $item_id);
+  }
+
+  /**
+   * Helper for getting the right count.
+   *
+   * @param XapianMSet $matches
+   *   The match set to read the count from.
+   *
+   * @return int|NULL
+   *   The lower bound, upper bound or estimated number of matches, depending
+   *   on the 'xapian_count_type' option; NULL when Xapian raised an error.
+   */
+  public function getMatchesCount(XapianMSet $matches) {
+    // An unset option means "best estimate", the same default the
+    // configuration form shows. Using the constant instead of NULL keeps the
+    // loosely comparing switch below from matching an unintended case.
+    $count_type = isset($this->options['xapian_count_type']) ? $this->options['xapian_count_type'] : self::MATCHES_BEST_ESTIMATE;
+
+    try {
+      switch ($count_type) {
+        case self::MATCHES_LOWER_BOUND:
+          $count = $matches->get_matches_lower_bound();
+          break;
+
+        case self::MATCHES_UPPER_BOUND:
+          $count = $matches->get_matches_upper_bound();
+          break;
+
+        case self::MATCHES_BEST_ESTIMATE:
+        default:
+          $count = $matches->get_matches_estimated();
+          break;
+      }
+    }
+    catch (Exception $e) {
+      // watchdog() translates and escapes at display time, so it gets the
+      // untranslated message plus its variables rather than t() output.
+      watchdog('search_api_xapian', 'An error getting the matches count: @msg.', array('@msg' => $e->getMessage()), WATCHDOG_ERROR);
+      return NULL;
+    }
+
+    return $count;
+  }
+
+  /**
+   * Helper to generate the select options for the xapian stemming languages.
+   *
+   * @return array
+   *   Language names as reported by XapianStem::get_available_languages(),
+   *   keyed by the name and valued with its capitalised form. Empty when the
+   *   list could not be retrieved, so it can always be used as '#options'.
+   */
+  public function getXapianLanguagesOptions() {
+    $languages = array();
+    try {
+      $list = explode(' ', XapianStem::get_available_languages());
+    }
+    catch (Exception $e) {
+      watchdog('search_api_xapian', 'An error getting the language options: @msg.', array('@msg' => $e->getMessage()), WATCHDOG_ERROR);
+      return $languages;
+    }
+
+    // explode() yields empty strings for an empty list or repeated spaces;
+    // those are not languages.
+    foreach (array_filter($list, 'strlen') as $language) {
+      $languages[$language] = drupal_ucfirst($language);
+    }
+    return $languages;
+  }
+
+  /**
+   * Flatten a keys array into a single search string.
+   *
+   * Nested key arrays are flattened recursively, empty keys are skipped and
+   * keys containing a space are wrapped in double quotes.
+   *
+   * @param array $keys
+   *   The keys array to flatten, formatted as specified by
+   *   SearchApiQueryInterface::getKeys().
+   *
+   * @return string
+   *   A Xapian query string representing the same keys.
+   */
+  protected function flattenKeys(array $keys) {
+    $k = array();
+    foreach (element_children($keys) as $i) {
+      $key = $keys[$i];
+      if (!$key) {
+        continue;
+      }
+      if (is_array($key)) {
+        $k[] = $this->flattenKeys($key);
+      }
+      else {
+        $key = trim($key);
+        if (strpos($key, ' ') !== FALSE) {
+          $key = '"' . $key .'"';
+        }
+        $k[] = $key;
+      }
+    }
+    if (!$k) {
+      return '';
+    }
+    if ($keys['#conjunction'] == 'OR') {
+      $k = '((' . implode(') OR (', $k) . '))';
+      return empty($keys['#negation']) ? $k : '-' .
$k;
+    }
+    if ($keys['#conjunction'] == 'AND') {
+      $k = '((' . implode(') AND (', $k) . '))';
+      return empty($keys['#negation']) ? $k : '-' . $k;
+    }
+
+    // Any other conjunction: join the keys with spaces.
+    $k = implode(' ', $k);
+    return empty($keys['#negation']) ? $k : '-(' . $k . ')';
+  }
+
+  /**
+   * Helper method for indexing a field into a document.
+   *
+   * List fields are unpacked and every entry is indexed with the nested type.
+   * 'tokens' and 'text' values go through the term generator; every other
+   * type is added to the document as a single term. Errors raised by Xapian
+   * are logged and not propagated.
+   *
+   * @param XapianDocument $document
+   *   A xapian document object passed by reference where the field belongs to.
+   * @param XapianTermGenerator $indexer
+   *   The xapian indexer that is going to be used to index the field.
+   * @param array $field
+   *   A field array as received inside each item of $items parameter on
+   *   SearchApiServiceInterface::indexItems(); its 'value', 'type' and
+   *   'boost' keys are read.
+   */
+  protected function indexField(&$document, &$indexer, $field) {
+    $value = $field['value'];
+    $type = $field['type'];
+    if (search_api_is_list_type($type)) {
+      // Strip the leading "list<" and the trailing ">" to get the item type.
+      $type = substr($type, 5, -1);
+      if (is_array($value)) {
+        foreach ($value as $v) {
+          // The entries are bare values, so rebuild a complete field array
+          // (inheriting the boost) before recursing.
+          $this->indexField($document, $indexer, array('value' => $v, 'type' => $type) + $field);
+        }
+      }
+      return;
+    }
+    $weight = (int)($field['boost']);
+    try {
+      switch ($type) {
+        case 'tokens':
+          foreach ($value as $v) {
+            $indexer->index_text($v['value'], $weight);
+          }
+          return;
+        case 'text':
+          // Free text has to be split into terms by the term generator; as a
+          // single term it could never be matched by a parsed query.
+          if ($value !== NULL && $value !== '') {
+            $indexer->index_text((string) $value, $weight);
+          }
+          return;
+        case 'boolean':
+          $value = $value ? 'true' : 'false';
+          break;
+        case 'date':
+          $value = is_numeric($value) ? (int) $value : strtotime($value);
+          if ($value === FALSE) {
+            return;
+          }
+          break;
+      }
+      // Nothing to index for an empty value.
+      if ($value === NULL || $value === '') {
+        return;
+      }
+      // xapian requires a string
+      $document->add_term((string) $value, $weight);
+    }
+    catch (Exception $e) {
+      // watchdog() translates and escapes at display time, so it gets the
+      // untranslated message plus its variables rather than t() output.
+      watchdog('search_api_xapian', 'An error indexing the field: @msg.', array('@msg' => $e->getMessage()), WATCHDOG_ERROR);
+    }
+  }
+
+  public function configurationForm() {
+    // See http://xapian.org/docs/stemming.html
+    $form['xapian_stem_language'] = array(
+      '#type' => 'select',
+      '#title' => t('Stemming language'),
+      '#options' => $this->getXapianLanguagesOptions(),
+      '#default_value' => empty($this->options['xapian_stem_language']) ?
'english' : $this->options['xapian_stem_language'],
+      '#description' => t('Select the language that Xapian should use when deriving the stem of each word when building an index.'),
+    );
+    $form['xapian_count_type'] = array(
+      '#type' => 'radios',
+      '#title' => t('Result count'),
+      '#description' => t('This setting determines the value that xapian returns for the result count returned from queries (used for number of pages in pagers, etc.)'),
+      '#default_value' => empty($this->options['xapian_count_type']) ? self::MATCHES_BEST_ESTIMATE : $this->options['xapian_count_type'],
+      '#options' => array(
+        self::MATCHES_BEST_ESTIMATE => t('Best estimate'),
+        self::MATCHES_LOWER_BOUND => t('Lower bound'),
+        self::MATCHES_UPPER_BOUND => t('Upper bound'),
+      ),
+    );
+    return $form;
+  }
+
+  /**
+   * Indexes the given items into the writable xapian database.
+   *
+   * @param SearchApiIndex $index
+   *   The index the items belong to.
+   * @param array $items
+   *   Arrays of fields to index, keyed by item id.
+   *
+   * @return array
+   *   The ids of the items that were indexed; empty when the database or the
+   *   indexer is unavailable or when Xapian raised an error.
+   */
+  public function indexItems(SearchApiIndex $index, array $items) {
+
+    $ret = array();
+
+    try {
+      $database = $this->getDatabase(TRUE);
+      $indexer = $this->getIndexer();
+      // Both getters return NULL on failure. Calling a method on NULL is a
+      // fatal error rather than an exception, so bail out here.
+      if (!is_object($database) || !is_object($indexer)) {
+        return $ret;
+      }
+      foreach ($items as $id => $item) {
+        $document = new XapianDocument();
+        $indexer->set_document($document);
+        // some ways to identify this document: all, index and item
+        $document->add_boolean_term('search_api_xapian_server');
+        $document->add_term('index' . $index->id);
+        $document->add_term('item_id' .
$id);
+        $document->set_data($id);
+
+        foreach ($item as $field) {
+          $this->indexField($document, $indexer, $field);
+        }
+
+        // replace_document() finds the previous copy of the item through this
+        // term but does not add the term to the new document. Without it,
+        // every re-index would add a duplicate and deleting by id would never
+        // match.
+        $unique_id = $this->getUniqueId($index->id, $id);
+        $document->add_boolean_term($unique_id);
+        $database->replace_document($unique_id, $document);
+        $ret[] = $id;
+      }
+      $this->flushDatabase();
+    }
+    catch (Exception $e) {
+      // watchdog() translates and escapes at display time, so it gets the
+      // untranslated message plus its variables rather than t() output.
+      watchdog('search_api_xapian', 'An error occurred while indexing: @msg.', array('@msg' => $e->getMessage()), WATCHDOG_ERROR);
+      return array();
+    }
+
+    return $ret;
+  }
+
+  /**
+   * Deletes indexed items from the xapian database.
+   *
+   * @param array|string $ids
+   *   An array of item ids to delete; any non-array value deletes all items.
+   * @param SearchApiIndex $index
+   *   The index whose items are deleted. When NULL, every document carrying
+   *   the 'search_api_xapian_server' term is deleted.
+   */
+  public function deleteItems($ids = 'all', SearchApiIndex $index = NULL) {
+    $database = $this->getDatabase(TRUE);
+    // Calling a method on a missing database would be a fatal error that the
+    // catch block below cannot intercept.
+    if (!is_object($database)) {
+      return;
+    }
+    try {
+      if ($index) {
+        if (is_array($ids)) {
+          // delete mentioned items
+          foreach ($ids as $id) {
+            $database->delete_document($this->getUniqueId($index->id, $id));
+          }
+        }
+        else {
+          // delete all items of this index
+          $database->delete_document('index' . $index->id);
+        }
+      }
+      else {
+        // delete all
+        $database->delete_document('search_api_xapian_server');
+      }
+      $this->flushDatabase();
+    }
+    catch (Exception $e) {
+      watchdog('search_api_xapian', 'An error occurred while deleting: @msg.', array('@msg' => $e->getMessage()), WATCHDOG_ERROR);
+    }
+  }
+
+  /**
+   * Executes a search against the xapian database.
+   *
+   * @param SearchApiQueryInterface $query
+   *   The query to execute; its keys and its 'offset' and 'limit' options are
+   *   used.
+   *
+   * @return array
+   *   An array with the keys 'result count', 'results' and 'performance'.
+   *
+   * @throws SearchApiException
+   *   When Xapian raises an error.
+   */
+  public function search(SearchApiQueryInterface $query) {
+    $start_time = microtime(TRUE);
+
+    try {
+      // 1. Prepare the search
+      $preprocess_start = microtime(TRUE);
+      $keys = $query->getKeys();
+      if (is_array($keys)) {
+        $keys = $this->flattenKeys($keys);
+      }
+      $stemmer = $this->getStemmer();
+      $database = $this->getDatabase();
+      $enquire = new XapianEnquire($database);
+      $query_parser = new XapianQueryParser();
+      $query_parser->set_stemmer($stemmer);
+      $query_parser->set_database($database);
+      $query_parser->set_stemming_strategy(XapianQueryParser::STEM_SOME); //TODO this should be an option
+      // NOTE(review): the parsed query is not restricted to the 'index<id>'
+      // term of the query's index - confirm whether servers are expected to
+      // hold a single index only.
+      $realQuery = $query_parser->parse_query($keys);
+      $start = $query->getOption('offset') === NULL ?
0 : $query->getOption('offset');
+      // a length = 0 gives us an empty MSet with valid statistics calculated
+      // without looking at any postings, which is very quick, but means the
+      // estimates may be more approximate and the bounds may be much looser.
+      $length = $query->getOption('limit') === NULL ? 0 : $query->getOption('limit');
+      $preprocess_end = microtime(TRUE);
+
+      // 2. Execute the search
+      $execution_start = microtime(TRUE);
+      $enquire->set_query($realQuery);
+      $matches = $enquire->get_mset((int)$start, (int)$length);
+      $execution_end = microtime(TRUE);
+
+      // 3. Extract results
+      $postprocessing_start = microtime(TRUE);
+      $results = array();
+      $results['result count'] = $this->getMatchesCount($matches);
+      $results['results'] = array();
+
+      // Each document stores its item id as document data (see indexItems()).
+      $i = $matches->begin();
+      while (!$i->equals($matches->end())) {
+        $document = $i->get_document();
+        $id = $document->get_data();
+        $results['results'][$id] = array(
+          'id' => $id,
+          'score' => (int)($i->get_percent()),
+        );
+        $i->next();
+      }
+      $postprocessing_end = microtime(TRUE);
+
+      // Compute performance
+      $end_time = microtime(TRUE);
+      $results['performance'] = array(
+        'complete' => $end_time - $start_time,
+        'preprocessing' => $preprocess_end - $preprocess_start,
+        // Elapsed time of step 2: end minus start.
+        'execution' => $execution_end - $execution_start,
+        'postprocessing' => $postprocessing_end - $postprocessing_start,
+      );
+
+      return $results;
+    }
+    catch (Exception $e) {
+      // Surface Xapian failures as the exception type Search API expects.
+      throw new SearchApiException($e->getMessage());
+    }
+  }
+
+}
+
+/**
+ * Search service class using Xapian library on a local directory.
+ */
+class SearchApiXapianLocalService extends SearchApiAbstractXapianService {
+
+  /**
+   * Local directory where xapian database is stored.
+   *
+   * Resolved from the 'xapian_database_path' option on first use.
+   */
+  protected $path = NULL;
+
+  /**
+   * Returns a handle on the local xapian database, opening it when needed.
+   *
+   * @param bool $writable
+   *   TRUE to get a writable handle (the database is created when it does
+   *   not exist yet), FALSE for a read-only one.
+   *
+   * @return XapianDatabase|XapianWritableDatabase|NULL
+   *   The database handle, or NULL when no usable path is configured or the
+   *   database could not be opened.
+   */
+  public function getDatabase($writable = FALSE) {
+    // Reuse a handle that has already been opened.
+    $ret_database = parent::getDatabase($writable);
+    if (is_object($ret_database)) {
+      return $ret_database;
+    }
+
+    if (is_null($this->path)) {
+      if (!isset($this->options['xapian_database_path'])) {
+        watchdog('search_api_xapian', 'No database path given for server with id = %server.', array('%server' => $this->id), WATCHDOG_ERROR);
+        return NULL;
+      }
+      // drupal_realpath() returns FALSE when the URI cannot be resolved. Do
+      // not cache that as the path, so a later call can try again.
+      $path = drupal_realpath($this->options['xapian_database_path']);
+      if (!$path) {
+        watchdog('search_api_xapian', 'The database path %path could not be resolved to a local directory.', array('%path' => $this->options['xapian_database_path']), WATCHDOG_ERROR);
+        return NULL;
+      }
+      $this->path = $path;
+    }
+
+    try {
+      if ($writable) {
+        $this->writable_database = new XapianWritableDatabase($this->path, Xapian::DB_CREATE_OR_OPEN);
+        return $this->writable_database;
+      }
+      else {
+        $this->database = new XapianDatabase($this->path);
+        return $this->database;
+      }
+    }
+    catch (Exception $e) {
+      // watchdog() translates and escapes at display time, so it gets the
+      // untranslated message plus its variables rather than t() output.
+      watchdog('search_api_xapian', 'An error occurred getting the @writable database: @msg.', array('@msg' => $e->getMessage(), '@writable' => ($writable ? 'writable' : 'non-writable')), WATCHDOG_ERROR);
+      return NULL;
+    }
+  }
+
+  /**
+   * Adds the database path setting to the common xapian configuration form.
+   */
+  public function configurationForm() {
+    $form = parent::configurationForm();
+    $form['xapian_database_path'] = array(
+      '#type' => 'textfield',
+      '#title' => t('Path to database'),
+      '#default_value' => empty($this->options['xapian_database_path']) ? file_default_scheme() . '://xapian_database' : $this->options['xapian_database_path'],
+      '#required' => TRUE,
+      '#description' => t('Directory where your local Xapian database will be created. Specify a directory writable by your web server process.'),
+    );
+    return $form;
+  }
+
+  public function configurationFormValidate(array $form, array &$values, $prefix = '') {
+    parent::configurationFormValidate($form, $values, $prefix);
+    // TODO check a valid drupal schema path
+  }
+
+}
+
+/**
+ * Search service class using Xapian library on a remote server.
+ */
+class SearchApiXapianServerService extends SearchApiAbstractXapianService {
+
+  /**
+   * Hostname where xapian-tcpsrv is running.
+   *
+   * Taken from the 'xapian_database_hostname' option on first use.
+   */
+  public $hostname = NULL;
+
+  /**
+   * Port where xapian-tcpsrv is running.
+   *
+   * Taken from the 'xapian_database_port' option on first use.
+   */
+  public $port = NULL;
+
+  /**
+   * Returns a handle on the remote xapian database, connecting when needed.
+   *
+   * @param bool $writable
+   *   TRUE to get a writable handle, FALSE for a read-only one.
+   *
+   * @return object|NULL
+   *   The database handle, or NULL when hostname or port are not configured
+   *   or the connection failed.
+   */
+  public function getDatabase($writable = FALSE) {
+    // Reuse a handle that has already been opened.
+    $ret_database = parent::getDatabase($writable);
+    if (is_object($ret_database)) {
+      return $ret_database;
+    }
+
+    if (is_null($this->hostname) || is_null($this->port)) {
+      if (isset($this->options['xapian_database_hostname']) && isset($this->options['xapian_database_port'])) {
+        $this->hostname = $this->options['xapian_database_hostname'];
+        $this->port = $this->options['xapian_database_port'];
+      }
+      else {
+        watchdog('search_api_xapian', 'No database hostname or port given for server with id = %server.', array('%server' => $this->id), WATCHDOG_ERROR);
+        return NULL;
+      }
+    }
+
+    try {
+      if ($writable) {
+        $this->writable_database = Xapian::remote_open_writable($this->hostname, (int)$this->port);
+        return $this->writable_database;
+      }
+      else {
+        $db_source = Xapian::remote_open($this->hostname, (int)$this->port);
+        $this->database = new XapianDatabase($db_source);
+        return $this->database;
+      }
+    }
+    catch (Exception $e) {
+      // watchdog() translates and escapes at display time, so it gets the
+      // untranslated message plus its variables rather than t() output.
+      watchdog('search_api_xapian', 'An error occurred getting the @writable database: @msg.', array('@msg' => $e->getMessage(), '@writable' => ($writable ? 'writable' : 'non-writable')), WATCHDOG_ERROR);
+      return NULL;
+    }
+
+  }
+
+  /**
+   * Adds the remote host and port settings to the common xapian form.
+   */
+  public function configurationForm() {
+    $form = parent::configurationForm();
+    // Show the stored settings when the server is edited; the hard-coded
+    // values are only the defaults for a new server.
+    $form['xapian_database_hostname'] = array(
+      '#type' => 'textfield',
+      '#title' => t('Database server'),
+      '#default_value' => empty($this->options['xapian_database_hostname']) ? '' : $this->options['xapian_database_hostname'],
+      '#required' => TRUE,
+      '#description' => t('IP address or host name of remote server running xapian-tcpsrv.'),
+    );
+    $form['xapian_database_port'] = array(
+      '#type' => 'textfield',
+      '#title' => t('Database port'),
+      '#default_value' => empty($this->options['xapian_database_port']) ? '6431' : $this->options['xapian_database_port'],
+      '#required' => TRUE,
+      '#description' => t('Remote port that xapian-tcpsrv is listening on.'),
+    );
+    return $form;
+  }
+
+  /**
+   * Validates that the submitted port is an integer between 1 and 65535.
+   */
+  public function configurationFormValidate(array $form, array &$values, $prefix = '') {
+    parent::configurationFormValidate($form, $values, $prefix);
+    $port = $values['xapian_database_port'];
+    // Only plain digits are accepted: getDatabase() casts the port to int,
+    // so values such as "80.5" or "1e3" would silently become another port.
+    if (!preg_match('/^\d+$/', (string) $port) || (int) $port < 1 || (int) $port > 65535) {
+      form_set_error($prefix . 'xapian_database_port', t('%value is not a valid port.', array('%value' => $port)));
+    }
+  }
+
+}