From 784e802c237e26f1f101c837bd3498ac2a6f38ff Mon Sep 17 00:00:00 2001 From: Florent Torregrosa Date: Tue, 14 Apr 2015 16:36:49 +0200 Subject: [PATCH 1/2] Issue #2470553: Allows search_api_attachments to parse txt, img files and other files with tika. --- .../processor/FilesFieldsProcessorPlugin.php | 152 +++++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) diff --git a/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php b/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php index 8fd68d1..18ce450 100644 --- a/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php +++ b/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php @@ -6,6 +6,8 @@ use Drupal\Core\TypedData\DataDefinition; use Drupal\field\Entity\FieldConfig; use Drupal\search_api\Datasource\DatasourceInterface; use Drupal\search_api\Processor\ProcessorPluginBase; +use Drupal\Core\StreamWrapper\StreamWrapperInterface; +use Drupal\Component\Utility\Xss; /** * @SearchApiProcessor( @@ -46,11 +48,55 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase { if (!($field = $item->getField('search_api_attachments_' . $field_name))) { continue; } - $field->addValue('test test'); + + // Need to retrieve the files. + $entity = $item->getOriginalObject()->getValue(); + $filefield_values = $entity->get($field_name)->getValue(); + + $fids = array(); + foreach ($filefield_values as $filefield_value) { + $fids[] = $filefield_value['target_id']; + } + + // Retrieve the files. + $files = entity_load_multiple('file', $fids); + + // Parse the files. + $extraction = ''; + foreach ($files as $file) { + if (file_exists($file->getFileUri())) { + if (in_array($file->getMimeType(), array('text/plain', 'text/x-diff'))) { + $extraction .= $this->extract_simple($file); + } + elseif (in_array($file->getMimeType(), array('image/jpeg', 'image/jpg', 'image/tiff'))) { + $extraction .= $this->extract_exif($file); + } + else { +// $extraction_method = variable_get('search_api_attachments_extract_using', 'tika'); +// // Send the extraction request to the right place depending on the +// // current setting. +// if ($extraction_method == 'tika') { + $extraction .= $this->extract_tika($file); +// } +// else { +// $extraction .= $this->extract_solr($file); +// } + } + } + } + + // Add the value in the indexed data. + $field->addValue($extraction); } } } + /** + * Helper function. + * + * @return array + * An array of file fields. + */ protected function getFileFields() { $file_fields = array(); // Retrieve file fields of indexed bundles. @@ -66,4 +112,108 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase { return $file_fields; } + /** + * Extract txt file. + * + * @param $file + * A file object. + * @return string + * The text extracted from the txt file. + */ + protected function extract_simple($file) { + $text = file_get_contents($this->get_realpath($file->getFileUri())); + $text = iconv("UTF-8", "UTF-8//IGNORE", $text); + $text = Xss::filter(str_replace(array('<', '>'), array(' <', '> '), $text), array()); + $text = htmlspecialchars(html_entity_decode($text, ENT_NOQUOTES, 'UTF-8'), ENT_NOQUOTES, 'UTF-8'); + $text = trim($text); + return $text; + } + + /** + * Extract img file. + * + * @param $file + * A file object. + * @return string + * The text extracted from the txt file. + */ + protected function extract_exif($file) { + $ret = ''; + $size = getimagesize($this->get_realpath($file->getFileUri()), $info); + if (isset($info['APP13'])) { + $iptc = iptcparse($info['APP13']); + if (is_array($iptc)) { + foreach ($iptc as $key => $value) { + foreach ($value as $innerkey => $innervalue) { + $ret .= $innervalue . ' '; + } + } + } + } + return $ret; + } + + /** + * Extract file with Tika library. + * + * @param $file + * A file object that implements FileInterface. + * + * @return string + * The text extracted from the file. + * @throws \Drupal\search_api_attachments\Plugin\search_api\processor\Exception + */ + protected function extract_tika($file) { + $filepath = $this->get_realpath($file->getFileUri()); + // TODO: Remove the hardcoded path to tika. + $tika_path = realpath('/var/apache-tika'); + // TODO: Remove the hardcoded tika jar filename. + $tika = realpath($tika_path . '/tika-app-1.7.jar'); + if (!($tika) || !is_file($tika)) { + throw new Exception(t('Invalid path or filename for tika application jar.')); + } + // UTF-8 multibyte characters will be stripped by escapeshellargs() for the + // default C-locale. + // So temporarily set the locale to UTF-8 so that the filepath remains valid. + $backup_locale = setlocale(LC_CTYPE, '0'); + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $param = ''; + if ($file->getMimeType() != 'audio/mpeg') { + $param = ' -Dfile.encoding=UTF8 -cp ' . escapeshellarg($tika_path); + } + + // Force running the Tika jar headless. + $param = ' -Djava.awt.headless=true ' . $param; + + $cmd = escapeshellcmd('java') . $param . ' -jar ' . escapeshellarg($tika) . ' -t ' . escapeshellarg($filepath); + if (strpos(ini_get('extension_dir'), 'MAMP/')) { + $cmd = 'export DYLD_LIBRARY_PATH=""; ' . $cmd; + } + // Restore the locale. + setlocale(LC_CTYPE, $backup_locale); + // Support UTF-8 commands: http://www.php.net/manual/en/function.shell-exec.php#85095 + shell_exec("LANG=en_US.utf-8"); + return shell_exec($cmd); + } + + /** + * Helper function. + * + * @param $uri + * The URI of the file, e.g. public://directory/file.jpg. + * @return mixed + * The real path to the file if it is a local file. An URL otherwise. + */ + protected function get_realpath($uri) { + $wrapper = \Drupal::service('stream_wrapper_manager')->getViaUri($uri); + $scheme = file_uri_scheme($uri); + $local_wrappers = \Drupal::service('stream_wrapper_manager')->getWrappers(StreamWrapperInterface::LOCAL); + if (in_array($scheme, array_keys($local_wrappers))) { + return $wrapper->realpath(); + } + else { + return $wrapper->getExternalUrl(); + } + } + } -- 1.7.10.4 From 26fb25aa1447633bda048f23d8ff8d67c1f64d4d Mon Sep 17 00:00:00 2001 From: Florent Torregrosa Date: Wed, 15 Apr 2015 17:03:48 +0200 Subject: [PATCH 2/2] Issue 2470553: WIP extract solR method. --- .../processor/FilesFieldsProcessorPlugin.php | 158 ++++++++++++++++++-- 1 file changed, 149 insertions(+), 9 deletions(-) diff --git a/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php b/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php index 18ce450..4794e98 100644 --- a/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php +++ b/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php @@ -8,6 +8,10 @@ use Drupal\search_api\Datasource\DatasourceInterface; use Drupal\search_api\Processor\ProcessorPluginBase; use Drupal\Core\StreamWrapper\StreamWrapperInterface; use Drupal\Component\Utility\Xss; +use Drupal\search_api\Entity\Server; +use Drupal\search_api_solr\Plugin\search_api\backend\SearchApiSolrBackend; +use Drupal\search_api_solr\Utility; +use SebastianBergmann\Exporter\Exception; /** * @SearchApiProcessor( @@ -64,7 +68,7 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase { // Parse the files. $extraction = ''; foreach ($files as $file) { - if (file_exists($file->getFileUri())) { + if (file_exists($this->get_realpath($file))) { if (in_array($file->getMimeType(), array('text/plain', 'text/x-diff'))) { $extraction .= $this->extract_simple($file); } @@ -76,10 +80,10 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase { // // Send the extraction request to the right place depending on the // // current setting. // if ($extraction_method == 'tika') { - $extraction .= $this->extract_tika($file); +// $extraction .= $this->extract_tika($file); // } // else { -// $extraction .= $this->extract_solr($file); + $extraction = $this->extract_solr($file); // } } } @@ -121,7 +125,7 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase { * The text extracted from the txt file. */ protected function extract_simple($file) { - $text = file_get_contents($this->get_realpath($file->getFileUri())); + $text = file_get_contents($this->get_realpath($file)); $text = iconv("UTF-8", "UTF-8//IGNORE", $text); $text = Xss::filter(str_replace(array('<', '>'), array(' <', '> '), $text), array()); $text = htmlspecialchars(html_entity_decode($text, ENT_NOQUOTES, 'UTF-8'), ENT_NOQUOTES, 'UTF-8'); @@ -139,7 +143,7 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase { */ protected function extract_exif($file) { $ret = ''; - $size = getimagesize($this->get_realpath($file->getFileUri()), $info); + $size = getimagesize($this->get_realpath($file), $info); if (isset($info['APP13'])) { $iptc = iptcparse($info['APP13']); if (is_array($iptc)) { @@ -164,7 +168,7 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase { * @throws \Drupal\search_api_attachments\Plugin\search_api\processor\Exception */ protected function extract_tika($file) { - $filepath = $this->get_realpath($file->getFileUri()); + $filepath = $this->get_realpath($file); // TODO: Remove the hardcoded path to tika. $tika_path = realpath('/var/apache-tika'); // TODO: Remove the hardcoded tika jar filename. @@ -197,14 +201,150 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase { } /** + * Extract data using Solr (via the ExtractingRequestHandler) or using the + * remote Tika servlet. + * @see http://wiki.apache.org/solr/ExtractingRequestHandler. + * @see http://wiki.apache.org/tika/TikaJAXRS. + */ + protected function extract_solr($file) { + $extraction = FALSE; + $filepath = $this->get_realpath($file); + + try { + $filename = basename($filepath); + + // Load all enabled, not read-only indexes. + $conditions = array( + 'status' => TRUE, + ); + + // @var \Drupal\search_api\Server + $search_api_servers = entity_load_multiple_by_properties('search_api_server', $conditions); + + // TODO: multiple server instances. + if (count($search_api_servers) == 1) { + + $server_name = key($search_api_servers); // TODO: Do it in a better way. + $server = Server::load($server_name); + + // Check that it is a solr server. + if ($server->getBackend()->getPluginId() == 'search_api_solr') { + // Check the connection to the solr server. + if ($server->getBackend()->ping()) { + // Get solarium client. + $solarium_client = $server->getBackend()->getSolrConnection(); + + // Prepare the extraction options. + // Heavily inspired by apachesolr_attachments. + // Construct a multi-part form-data POST body in $data. + $boundary = '--' . md5(uniqid(REQUEST_TIME)); + $data = "--{$boundary}\r\n"; + // The 'filename' used here becomes the property name in the response. + $data .= 'Content-Disposition: form-data; name="file"; filename="extracted"'; + $data .= "\r\nContent-Type: application/octet-stream\r\n\r\n"; + $data .= file_get_contents($filepath); + $data .= "\r\n--{$boundary}--\r\n"; + $headers = array('Content-Type' => 'multipart/form-data; boundary=' . $boundary); + $options = array( + 'method' => 'POST', + 'headers' => $headers, + 'data' => $data, + ); + $options = array(); + + // Create extract query. + $extract_query = $solarium_client->createExtract($options); + $extract_query->addFieldMapping('fmap.content', 'text'); +// $extract_query->addFieldMapping('invariants', 'text'); +// $extract_query->setUprefix('attr_'); + $extract_query->setCommit(FALSE); + $extract_query->setOmitHeader(FALSE); + + $extract_query->setFile($filepath); + + // add document + $doc = $extract_query->createDocument(); + $doc->id = 'extract-test'; + $doc->some = 'more fields'; + $extract_query->setDocument($doc); + + $extract = $solarium_client->extract($extract_query); +// $extract_result = $server->getBackend()->extractResult($extract_query, $extract); + +// http://wiki.solarium-project.org/index.php/V3:Extract_query#Building_an_extract_query + +// // get an extract query instance and add settings +// $query = $solarium_client->createExtract(); +// $query->addFieldMapping('content', 'text'); +// $query->setUprefix('attr_'); +// $query->setFile('/home/florent/sites/drupal8/sites/default/files/banana.txt'); +// $query->setCommit(true); +// $query->setOmitHeader(false); +// +// // add document +// $doc = $query->createDocument(); +// $doc->id = 'extract-test'; +// $doc->some = 'more fields'; +// $query->setDocument($doc); +// +// // this executes the query and returns the result +// $result = $solarium_client->extract($query); + +// echo 'Extract query executed
'; +// echo 'Query status: ' . $result->getStatus(). '
'; +// echo 'Query time: ' . $result->getQueryTime(); + + +// // Heavily inspired by apachesolr_file. +// // @see apachesolr_file_extract(). +// // Construct a multi-part form-data POST body in $data. +// $boundary = '--' . md5(uniqid(REQUEST_TIME)); +// $data = "--{$boundary}\r\n"; +// // The 'filename' used here becomes the property name in the response. +// $data .= 'Content-Disposition: form-data; name="file"; filename="extracted"'; +// $data .= "\r\nContent-Type: application/octet-stream\r\n\r\n"; +// $data .= file_get_contents('/home/trousselin/Desktop/ddd.pdf'); +// $data .= "\r\n--{$boundary}--\r\n"; +// $headers = array('Content-Type' => 'multipart/form-data; boundary=' . $boundary); +// $options = array( +// 'method' => 'POST', +// 'headers' => $headers, +// 'data' => $data, +// ); +// +// // Make a servlet request using the solr connection. +// $response = $solr_connection->makeServletRequest($servlet_path, $params, $options); +// +// // If we have an extracted response, all is well. +// if (isset($response->extracted)) { +// $extraction = $response->extracted; +// } + } + } + } + } + catch (Exception $e) { + // Log the exception to watchdog. Exceptions from Solr may be transient, + // or indicate a problem with a specific file. + $message = t('Exception occurred sending %filepath to Solr.', array('%filepath' => $file->getFileUri())); + \Drupal::logger('search_api_attachments')->error($message); + watchdog_exception('search_api_attachments', $e); + } + + return $extraction; + } + + /** * Helper function. * - * @param $uri - * The URI of the file, e.g. public://directory/file.jpg. + * @param $file + * A file object. * @return mixed * The real path to the file if it is a local file. An URL otherwise. */ - protected function get_realpath($uri) { + protected function get_realpath($file) { + $uri = $file->getFileUri(); $wrapper = \Drupal::service('stream_wrapper_manager')->getViaUri($uri); $scheme = file_uri_scheme($uri); $local_wrappers = \Drupal::service('stream_wrapper_manager')->getWrappers(StreamWrapperInterface::LOCAL); -- 1.7.10.4