From 784e802c237e26f1f101c837bd3498ac2a6f38ff Mon Sep 17 00:00:00 2001
From: Florent Torregrosa
Date: Tue, 14 Apr 2015 16:36:49 +0200
Subject: [PATCH] Issue #2470553: Allows search_api_attachments to parse txt, img files and other files with tika.

---
 .../processor/FilesFieldsProcessorPlugin.php       |  156 +++++++++++++++++++-
 1 file changed, 155 insertions(+), 1 deletion(-)

diff --git a/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php b/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php
index 8fd68d1..18ce450 100644
--- a/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php
+++ b/src/Plugin/search_api/processor/FilesFieldsProcessorPlugin.php
@@ -6,6 +6,8 @@ use Drupal\Core\TypedData\DataDefinition;
 use Drupal\field\Entity\FieldConfig;
 use Drupal\search_api\Datasource\DatasourceInterface;
 use Drupal\search_api\Processor\ProcessorPluginBase;
+use Drupal\Core\StreamWrapper\StreamWrapperInterface;
+use Drupal\Component\Utility\Xss;
 
 /**
  * @SearchApiProcessor(
@@ -46,11 +48,55 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase {
         if (!($field = $item->getField('search_api_attachments_' . $field_name))) {
           continue;
         }
-        $field->addValue('test test');
+
+        // Need to retrieve the files.
+        $entity = $item->getOriginalObject()->getValue();
+        $filefield_values = $entity->get($field_name)->getValue();
+
+        $fids = array();
+        foreach ($filefield_values as $filefield_value) {
+          $fids[] = $filefield_value['target_id'];
+        }
+
+        // Retrieve the files.
+        $files = entity_load_multiple('file', $fids);
+
+        // Parse the files.
+        $extraction = '';
+        foreach ($files as $file) {
+          if (file_exists($file->getFileUri())) {
+            if (in_array($file->getMimeType(), array('text/plain', 'text/x-diff'))) {
+              $extraction .= $this->extract_simple($file);
+            }
+            elseif (in_array($file->getMimeType(), array('image/jpeg', 'image/jpg', 'image/tiff'))) {
+              $extraction .= $this->extract_exif($file);
+            }
+            else {
+//              $extraction_method = variable_get('search_api_attachments_extract_using', 'tika');
+//              // Send the extraction request to the right place depending on the
+//              // current setting.
+//              if ($extraction_method == 'tika') {
+              $extraction .= $this->extract_tika($file);
+//              }
+//              else {
+//                $extraction .= $this->extract_solr($file);
+//              }
+            }
+          }
+        }
+
+        // Add the value in the indexed data.
+        $field->addValue($extraction);
       }
     }
   }
 
+  /**
+   * Retrieves the file fields of the indexed bundles.
+   *
+   * @return array
+   *   An array of file fields.
+   */
   protected function getFileFields() {
     $file_fields = array();
     // Retrieve file fields of indexed bundles.
@@ -66,4 +112,112 @@ class FilesFieldsProcessorPlugin extends ProcessorPluginBase {
     return $file_fields;
   }
 
+  /**
+   * Extract txt file.
+   *
+   * @param $file
+   *   A file object.
+   * @return string
+   *   The text extracted from the txt file.
+   */
+  protected function extract_simple($file) {
+    $text = file_get_contents($this->get_realpath($file->getFileUri()));
+    $text = iconv("UTF-8", "UTF-8//IGNORE", $text);
+    $text = Xss::filter(str_replace(array('<', '>'), array(' <', '> '), $text), array());
+    $text = htmlspecialchars(html_entity_decode($text, ENT_NOQUOTES, 'UTF-8'), ENT_NOQUOTES, 'UTF-8');
+    $text = trim($text);
+    return $text;
+  }
+
+  /**
+   * Extract img file.
+   *
+   * @param $file
+   *   A file object.
+   * @return string
+   *   The IPTC (APP13) values of the img file, separated by spaces.
+   */
+  protected function extract_exif($file) {
+    $ret = '';
+    getimagesize($this->get_realpath($file->getFileUri()), $info);
+    if (isset($info['APP13'])) {
+      $iptc = iptcparse($info['APP13']);
+      if (is_array($iptc)) {
+        foreach ($iptc as $values) {
+          foreach ($values as $value) {
+            $ret .= $value . ' ';
+          }
+        }
+      }
+    }
+    return $ret;
+  }
+
+  /**
+   * Extract file with Tika library.
+   *
+   * @param $file
+   *   A file object that implements FileInterface.
+   *
+   * @return string
+   *   The text extracted from the file.
+   * @throws \Exception
+   */
+  protected function extract_tika($file) {
+    $filepath = $this->get_realpath($file->getFileUri());
+    // TODO: Remove the hardcoded path to tika.
+    $tika_path = realpath('/var/apache-tika');
+    // TODO: Remove the hardcoded tika jar filename.
+    $tika = realpath($tika_path . '/tika-app-1.7.jar');
+    if (!($tika) || !is_file($tika)) {
+      throw new \Exception(t('Invalid path or filename for tika application jar.'));
+    }
+    // UTF-8 multibyte characters will be stripped by escapeshellarg() for the
+    // default C-locale.
+    // So temporarily set the locale to UTF-8 so that the filepath remains valid.
+    $backup_locale = setlocale(LC_CTYPE, '0');
+    setlocale(LC_CTYPE, 'en_US.UTF-8');
+    $param = '';
+    if ($file->getMimeType() != 'audio/mpeg') {
+      $param = ' -Dfile.encoding=UTF8 -cp ' . escapeshellarg($tika_path);
+    }
+
+    // Force running the Tika jar headless.
+    $param = ' -Djava.awt.headless=true ' . $param;
+
+    $cmd = escapeshellcmd('java') . $param . ' -jar ' . escapeshellarg($tika) . ' -t ' . escapeshellarg($filepath);
+    if (strpos(ini_get('extension_dir'), 'MAMP/') !== FALSE) {
+      $cmd = 'export DYLD_LIBRARY_PATH=""; ' . $cmd;
+    }
+    // Restore the locale.
+    setlocale(LC_CTYPE, $backup_locale);
+    // Support UTF-8 commands: http://www.php.net/manual/en/function.shell-exec.php#85095
+    // Set it in this process's environment so the spawned shell inherits it.
+    $backup_lang = getenv('LANG');
+    putenv('LANG=en_US.UTF-8');
+    $output = shell_exec($cmd);
+    putenv($backup_lang === FALSE ? 'LANG' : 'LANG=' . $backup_lang);
+    return $output;
+  }
+
+  /**
+   * Helper function.
+   *
+   * @param $uri
+   *   The URI of the file, e.g. public://directory/file.jpg.
+   * @return mixed
+   *   The real path to the file if it is a local file. A URL otherwise.
+   */
+  protected function get_realpath($uri) {
+    $wrapper = \Drupal::service('stream_wrapper_manager')->getViaUri($uri);
+    $scheme = file_uri_scheme($uri);
+    $local_wrappers = \Drupal::service('stream_wrapper_manager')->getWrappers(StreamWrapperInterface::LOCAL);
+    if (isset($local_wrappers[$scheme])) {
+      return $wrapper->realpath();
+    }
+    else {
+      return $wrapper->getExternalUrl();
+    }
+  }
+
 }
-- 
1.7.10.4