From 7a1bfd0b3c6c68b088542c889bef8b66f4c5d932 Mon Sep 17 00:00:00 2001
From: Florent Torregrosa
Date: Tue, 14 Apr 2015 16:36:49 +0200
Subject: [PATCH] Issue #2470553: WIP extract with SolR.

---
 .../processor/FilesFieldsProcessorPlugin.php       |  250 +++++++++++++++++++-
 1 file changed, 249 insertions(+), 1 deletion(-)

diff --git a/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php b/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php
index 8fd68d1..4794e98 100644
--- a/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php
+++ b/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php
@@ -6,6 +6,12 @@ use Drupal\Core\TypedData\DataDefinition;
 use Drupal\field\Entity\FieldConfig;
 use Drupal\search_api\Datasource\DatasourceInterface;
 use Drupal\search_api\Processor\ProcessorPluginBase;
+use Drupal\Core\StreamWrapper\StreamWrapperInterface;
+use Drupal\Component\Utility\Xss;
+use Drupal\search_api\Entity\Server;
+use Drupal\search_api_solr\Plugin\search_api\backend\SearchApiSolrBackend;
+use Drupal\search_api_solr\Utility;
+use SebastianBergmann\Exporter\Exception;
 
 /**
  * @SearchApiProcessor(
@@ -46,11 +52,55 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase {
         if (!($field = $item->getField('search_api_attachments_' . $field_name))) {
           continue;
         }
-        $field->addValue('test test');
+
+        // Need to retrieve the files.
+        $entity = $item->getOriginalObject()->getValue();
+        $filefield_values = $entity->get($field_name)->getValue();
+
+        $fids = array();
+        foreach ($filefield_values as $filefield_value) {
+          $fids[] = $filefield_value['target_id'];
+        }
+
+        // Retrieve the files.
+        $files = entity_load_multiple('file', $fids);
+
+        // Parse the files.
+        $extraction = '';
+        foreach ($files as $file) {
+          if (file_exists($this->get_realpath($file))) {
+            if (in_array($file->getMimeType(), array('text/plain', 'text/x-diff'))) {
+              $extraction .= $this->extract_simple($file);
+            }
+            elseif (in_array($file->getMimeType(), array('image/jpeg', 'image/jpg', 'image/tiff'))) {
+              $extraction .= $this->extract_exif($file);
+            }
+            else {
+//              $extraction_method = variable_get('search_api_attachments_extract_using', 'tika');
+//              // Send the extraction request to the right place depending on the
+//              // current setting.
+//              if ($extraction_method == 'tika') {
+//                $extraction .= $this->extract_tika($file);
+//              }
+//              else {
+              $extraction .= $this->extract_solr($file);
+//              }
+            }
+          }
+        }
+
+        // Add the value in the indexed data.
+        $field->addValue($extraction);
       }
     }
   }
 
+  /**
+   * Helper function.
+   *
+   * @return array
+   *   An array of file fields.
+   */
   protected function getFileFields() {
     $file_fields = array();
     // Retrieve file fields of indexed bundles.
@@ -66,4 +116,202 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase {
     return $file_fields;
   }
 
+  /**
+   * Extracts the content of a plain text file.
+   *
+   * @param $file
+   *   A file object.
+   *
+   * @return string
+   *   The text read from the file, with markup stripped and special
+   *   characters encoded as HTML entities.
+   */
+  protected function extract_simple($file) {
+    $text = file_get_contents($this->get_realpath($file));
+    // Drop the byte sequences that are not valid UTF-8.
+    $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
+    // Strip all the tags; the brackets are padded first, presumably so that
+    // the words around a tag are not glued together.
+    $text = Xss::filter(str_replace(array('<', '>'), array(' <', '> '), $text), array());
+    $text = htmlspecialchars(html_entity_decode($text, ENT_NOQUOTES, 'UTF-8'), ENT_NOQUOTES, 'UTF-8');
+    return trim($text);
+  }
+
+  /**
+   * Extracts the IPTC metadata of an image file.
+   *
+   * @param $file
+   *   A file object.
+   *
+   * @return string
+   *   The IPTC values embedded in the image, each followed by a space. Empty
+   *   if the image carries no parsable APP13 segment.
+   */
+  protected function extract_exif($file) {
+    $ret = '';
+    // Only the segments collected in $info are needed, not the image size.
+    getimagesize($this->get_realpath($file), $info);
+    if (isset($info['APP13'])) {
+      $iptc = iptcparse($info['APP13']);
+      if (is_array($iptc)) {
+        foreach ($iptc as $values) {
+          foreach ($values as $value) {
+            $ret .= $value . ' ';
+          }
+        }
+      }
+    }
+    return $ret;
+  }
+
+  /**
+   * Extract file with Tika library.
+   *
+   * @param $file
+   *   A file object that implements FileInterface.
+   *
+   * @return string
+   *   The text extracted from the file.
+   *
+   * @throws \Exception
+   *   If the tika application jar cannot be found.
+   */
+  protected function extract_tika($file) {
+    $filepath = $this->get_realpath($file);
+    // TODO: Remove the hardcoded path to tika.
+    $tika_path = realpath('/var/apache-tika');
+    // TODO: Remove the hardcoded tika jar filename.
+    $tika = realpath($tika_path . '/tika-app-1.7.jar');
+    if (!($tika) || !is_file($tika)) {
+      // The name is fully qualified because a bare "Exception" resolves to
+      // the SebastianBergmann\Exporter\Exception imported by this file.
+      throw new \Exception(t('Invalid path or filename for tika application jar.'));
+    }
+    // UTF-8 multibyte characters will be stripped by escapeshellarg() for the
+    // default C-locale.
+    // So temporarily set the locale to UTF-8 so that the filepath remains valid.
+    $backup_locale = setlocale(LC_CTYPE, '0');
+    setlocale(LC_CTYPE, 'en_US.UTF-8');
+    $param = '';
+    if ($file->getMimeType() != 'audio/mpeg') {
+      $param = ' -Dfile.encoding=UTF8 -cp ' . escapeshellarg($tika_path);
+    }
+
+    // Force running the Tika jar headless.
+    $param = ' -Djava.awt.headless=true ' . $param;
+
+    $cmd = escapeshellcmd('java') . $param . ' -jar ' . escapeshellarg($tika) . ' -t ' . escapeshellarg($filepath);
+    // strpos() returns 0 for a match at the very start, so compare against
+    // FALSE explicitly.
+    if (strpos(ini_get('extension_dir'), 'MAMP/') !== FALSE) {
+      $cmd = 'export DYLD_LIBRARY_PATH=""; ' . $cmd;
+    }
+    // Restore the locale.
+    setlocale(LC_CTYPE, $backup_locale);
+    // Support UTF-8 commands: http://www.php.net/manual/en/function.shell-exec.php#85095
+    shell_exec("LANG=en_US.utf-8");
+    return shell_exec($cmd);
+  }
+
+  /**
+   * Extract data using Solr (via the ExtractingRequestHandler) or using the
+   * remote Tika servlet.
+   *
+   * Work in progress: the query is sent to Solr but its result is not read
+   * yet, so FALSE is always returned.
+   *
+   * @param $file
+   *   A file object.
+   *
+   * @return string|false
+   *   The extracted text, FALSE when nothing could be extracted.
+   *
+   * @see http://wiki.apache.org/solr/ExtractingRequestHandler
+   * @see http://wiki.apache.org/tika/TikaJAXRS
+   */
+  protected function extract_solr($file) {
+    $extraction = FALSE;
+    $filepath = $this->get_realpath($file);
+
+    try {
+      // Load all the enabled servers.
+      $conditions = array(
+        'status' => TRUE,
+      );
+
+      // @var \Drupal\search_api\Server
+      $search_api_servers = entity_load_multiple_by_properties('search_api_server', $conditions);
+
+      // TODO: multiple server instances.
+      if (count($search_api_servers) == 1) {
+
+        $server_name = key($search_api_servers); // TODO: Do it in a better way.
+        $server = Server::load($server_name);
+
+        // Check that it is a solr server.
+        if ($server->getBackend()->getPluginId() == 'search_api_solr') {
+          // Check the connection to the solr server.
+          if ($server->getBackend()->ping()) {
+            // Get solarium client.
+            $solarium_client = $server->getBackend()->getSolrConnection();
+
+            // Create extract query.
+            // @see http://wiki.solarium-project.org/index.php/V3:Extract_query#Building_an_extract_query
+            $extract_query = $solarium_client->createExtract(array());
+            $extract_query->addFieldMapping('fmap.content', 'text');
+            $extract_query->setCommit(FALSE);
+            $extract_query->setOmitHeader(FALSE);
+            $extract_query->setFile($filepath);
+
+            // Add document.
+            $doc = $extract_query->createDocument();
+            $doc->id = 'extract-test';
+            $doc->some = 'more fields';
+            $extract_query->setDocument($doc);
+
+            // TODO: Read the extracted text out of the result and assign it
+            // to $extraction.
+            $solarium_client->extract($extract_query);
+          }
+        }
+      }
+    }
+    catch (\Exception $e) {
+      // The name is fully qualified so that every exception is caught: a bare
+      // "Exception" resolves to the SebastianBergmann\Exporter\Exception
+      // imported by this file.
+      // Log the exception to watchdog. Exceptions from Solr may be transient,
+      // or indicate a problem with a specific file.
+      $message = t('Exception occurred sending %filepath to Solr.', array('%filepath' => $file->getFileUri()));
+      \Drupal::logger('search_api_attachments')->error($message);
+      watchdog_exception('search_api_attachments', $e);
+    }
+
+    return $extraction;
+  }
+
+  /**
+   * Helper function.
+   *
+   * @param $file
+   *   A file object.
+   *
+   * @return mixed
+   *   The real path to the file if it is a local file, a URL otherwise.
+   *   FALSE if no stream wrapper is available for the URI.
+   */
+  protected function get_realpath($file) {
+    $uri = $file->getFileUri();
+    $stream_wrapper_manager = \Drupal::service('stream_wrapper_manager');
+    $wrapper = $stream_wrapper_manager->getViaUri($uri);
+    if (!$wrapper) {
+      return FALSE;
+    }
+    $local_wrappers = $stream_wrapper_manager->getWrappers(StreamWrapperInterface::LOCAL);
+    if (isset($local_wrappers[file_uri_scheme($uri)])) {
+      return $wrapper->realpath();
+    }
+    return $wrapper->getExternalUrl();
+  }
+
 }
-- 
1.7.10.4