diff --git a/README.txt b/README.txt index ba56e89..dd112ea 100644 --- a/README.txt +++ b/README.txt @@ -209,6 +209,12 @@ Default: FALSE Description: Flag to stop feeds from using its cURL for http requests. See http_request_use_curl(). +Name: feeds_use_mbstring +Default: TRUE +Description: The extension mbstring is used to convert encodings during parsing. + The reason that this can be turned off is to be able to test Feeds + behavior when the extension is not available. + Glossary ======== diff --git a/feeds.info b/feeds.info index 8929f14..6614113 100644 --- a/feeds.info +++ b/feeds.info @@ -50,6 +50,7 @@ files[] = tests/feeds_fetcher_http.test files[] = tests/feeds_i18n.test files[] = tests/feeds_i18n_node.test files[] = tests/feeds_i18n_taxonomy.test +files[] = tests/feeds_parser_csv.test files[] = tests/feeds_parser_sitemap.test files[] = tests/feeds_parser_syndication.test files[] = tests/feeds_processor_node.test diff --git a/feeds.install b/feeds.install index 8c14aa1..ab26957 100644 --- a/feeds.install +++ b/feeds.install @@ -60,6 +60,7 @@ function feeds_requirements($phase) { function feeds_uninstall() { variable_del('http_request_timeout'); variable_del('feeds_reschedule'); + variable_del('feeds_use_mbstring'); } /** diff --git a/libraries/ParserCSV.inc b/libraries/ParserCSV.inc index 4ddc77a..5380201 100644 --- a/libraries/ParserCSV.inc +++ b/libraries/ParserCSV.inc @@ -66,6 +66,8 @@ class ParserCSVIterator implements Iterator { */ class ParserCSV { private $delimiter; + private $fromEncoding; + private $toEncoding; private $skipFirstLine; private $columnNames; private $timeout; @@ -73,9 +75,12 @@ class ParserCSV { private $startByte; private $lineLimit; private $lastLinePos; + private $useMbString; public function __construct() { $this->delimiter = ','; + $this->fromEncoding = 'UTF-8'; + $this->toEncoding = 'UTF-8'; $this->skipFirstLine = FALSE; $this->columnNames = FALSE; $this->timeout = FALSE; @@ -84,6 +89,9 @@ class ParserCSV { $this->lineLimit = 0; $this->lastLinePos = 0; ini_set('auto_detect_line_endings', TRUE); + if (extension_loaded('mbstring') && variable_get('feeds_use_mbstring', TRUE)) { + $this->useMbString = TRUE; + } } /** @@ -95,6 +103,18 @@ class ParserCSV { } /** + * Sets the source file encoding. + * + * By default, the encoding is UTF-8. + * + * @param string $encoding + * The encoding to set. + */ + public function setEncoding($encoding) { + $this->fromEncoding = $encoding; + } + + /** * Set this to TRUE if the parser should skip the first line of the CSV text, * which might be desired if the first line contains the column names. * By default, this is set to FALSE and the first line is not skipped. @@ -197,7 +217,7 @@ class ParserCSV { for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) { // Make really sure we've got lines without trailing newlines. - $line = trim($lineIterator->current(), "\r\n"); + $line = trim($this->fixEncoding($lineIterator->current()), "\r\n"); // Skip empty lines. if (empty($line)) { @@ -237,7 +257,7 @@ class ParserCSV { } // Ok, so, on with fetching the next line, as mentioned above. $currentField .= "\n"; - $line = trim($lineIterator->current(), "\r\n"); + $line = trim($this->fixEncoding($lineIterator->current()), "\r\n"); $currentIndex = 0; continue; } @@ -325,4 +345,36 @@ class ParserCSV { } return $rows; } + + /** + * Converts encoding of input data. + * + * @param string $data + * A chunk of data. + * + * @return string + * The encoded data. + * + * @throws ParserCSVEncodingException + * Thrown when a given encoding does not match. + */ + public function fixEncoding($data) { + if ($this->useMbString && $this->toEncoding != $this->fromEncoding) { + if (mb_check_encoding($data, $this->fromEncoding)) { + // Convert encoding. The conversion is to UTF-8 by default to prevent + // SQL errors. + $data = mb_convert_encoding($data, $this->toEncoding, $this->fromEncoding); + } + else { + throw new ParserCSVEncodingException(t('Source file is not in %encoding encoding.', array('%encoding' => $this->fromEncoding))); + } + } + + return $data; + } } + +/** + * Exception thrown when an encoding error occurs during parsing. + */ +class ParserCSVEncodingException extends Exception {} diff --git a/plugins/FeedsCSVParser.inc b/plugins/FeedsCSVParser.inc index e6be83f..f9caeaf 100644 --- a/plugins/FeedsCSVParser.inc +++ b/plugins/FeedsCSVParser.inc @@ -22,6 +22,10 @@ class FeedsCSVParser extends FeedsParser { $parser = new ParserCSV(); $delimiter = $this->getDelimiterChar($source_config); $parser->setDelimiter($delimiter); + if (isset($source_config['encoding'])) { + // Encoding can only be set when the mbstring extension is loaded. + $parser->setEncoding($source_config['encoding']); + } $iterator = new ParserCSVIterator($fetcher_result->getFilePath()); if (empty($source_config['no_headers'])) { @@ -114,6 +118,7 @@ class FeedsCSVParser extends FeedsParser { public function sourceDefaults() { return array( 'delimiter' => $this->config['delimiter'], + 'encoding' => $this->config['encoding'], 'no_headers' => $this->config['no_headers'], ); } @@ -172,6 +177,10 @@ class FeedsCSVParser extends FeedsParser { '#description' => t('Check if the imported CSV file does not start with a header row. If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'), '#default_value' => isset($source_config['no_headers']) ? $source_config['no_headers'] : 0, ); + $form['encoding'] = $this->configEncodingForm(); + if (isset($source_config['encoding'])) { + $form['encoding']['#default_value'] = $source_config['encoding']; + } return $form; } @@ -181,6 +190,7 @@ class FeedsCSVParser extends FeedsParser { public function configDefaults() { return array( 'delimiter' => ',', + 'encoding' => 'UTF-8', 'no_headers' => 0, ); } @@ -203,9 +213,42 @@ class FeedsCSVParser extends FeedsParser { '#description' => t('Check if the imported CSV file does not start with a header row. If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'), '#default_value' => $this->config['no_headers'], ); + $form['encoding'] = $this->configEncodingForm(); return $form; } + /** + * Builds configuration field for setting file encoding. + * + * If the mbstring extension is not available a markup render array + * will be returned instead. + * + * @return array + * A renderable array. + */ + public function configEncodingForm() { + if (extension_loaded('mbstring') && variable_get('feeds_use_mbstring', TRUE)) { + // Get the system's list of available encodings. + $options = mb_list_encodings(); + // Make the key/values the same in the array. + $options = array_combine($options, $options); + // Sort alphabetically not-case sensitive. + natcasesort($options); + return array( + '#type' => 'select', + '#title' => t('File encoding'), + '#description' => t('Performs character encoding conversion from selected option to UTF-8.'), + '#options' => $options, + '#default_value' => $this->config['encoding'], + ); + } + else { + return array( + '#markup' => '' . t('PHP mbstring extension must be available for character encoding conversion.') . '', + ); + } + } + public function getTemplate() { $mappings = feeds_importer($this->id)->processor->config['mappings']; $sources = $uniques = array(); diff --git a/tests/feeds/encoding.csv.php b/tests/feeds/encoding.csv.php new file mode 100644 index 0000000..05f300c --- /dev/null +++ b/tests/feeds/encoding.csv.php @@ -0,0 +1,33 @@ + 'CSV parser functional tests', + 'description' => 'Tests the CSV parser using the UI.', + 'group' => 'Feeds', + ); + } + + /** + * Tests parsing a CSV when the mbstring extension is not available. + */ + public function testMbstringExtensionDisabled() { + // Set "feeds_use_mbstring" to FALSE to emulate that the mbstring extension + // is not loaded. + variable_set('feeds_use_mbstring', FALSE); + + // Remove items after parsing because in < PHP 5.4 processing items with + // encoding issues leads to test failures because check_plain() can only + // handle UTF-8 encoded strings. + // @see feeds_tests_feeds_after_parse() + variable_set('feeds_tests_feeds_after_parse_empty_items', TRUE); + + // Create node type. + $node_type = $this->drupalCreateContentType(); + + // Create and configure importer. + $this->createImporterConfiguration('Content CSV', 'csv'); + $this->setPlugin('csv', 'FeedsFileFetcher'); + $this->setPlugin('csv', 'FeedsCSVParser'); + $this->setSettings('csv', 'FeedsNodeProcessor', array('bundle' => $node_type->type)); + $this->addMappings('csv', array( + 0 => array( + 'source' => 'id', + 'target' => 'guid', + ), + 1 => array( + 'source' => 'text', + 'target' => 'title', + ), + )); + + // Ensure that on the CSV parser settings page a message is shown about that + // the mbstring extension is not available. + $this->drupalGet('admin/structure/feeds/csv/settings/FeedsCSVParser'); + $this->assertNoField('encoding'); + $this->assertText('PHP mbstring extension must be available for character encoding conversion.'); + + // Try to import a CSV file that is not UTF-8 encoded. No encoding warning + // should be shown, but import should fail. + $this->importFile('csv', $this->absolutePath() . '/tests/feeds/encoding_SJIS.csv'); + $this->assertNoText('Source file is not in UTF-8 encoding.'); + } + + /** + * Tests an encoding failure during parsing a CSV. + */ + public function testEncodingFailure() { + // Create node type. + $node_type = $this->drupalCreateContentType(); + + // Create and configure importer. + $this->createImporterConfiguration('Content CSV', 'csv'); + $this->setPlugin('csv', 'FeedsFileFetcher'); + $this->setPlugin('csv', 'FeedsCSVParser'); + $this->setSettings('csv', 'FeedsNodeProcessor', array('bundle' => $node_type->type)); + $this->addMappings('csv', array( + 0 => array( + 'source' => 'id', + 'target' => 'guid', + ), + 1 => array( + 'source' => 'text', + 'target' => 'title', + ), + )); + + // Ensure that on the CSV parser settings page a setting for encoding is + // shown. + $this->drupalGet('admin/structure/feeds/csv/settings/FeedsCSVParser'); + $this->assertField('encoding'); + $this->assertNoText('PHP mbstring extension must be available for character encoding conversion.'); + + // Try to import a CSV file that is not UTF-8 encoded. Import should be + // halted and an encoding warning should be shown. + $this->importFile('csv', $this->absolutePath() . '/tests/feeds/encoding_SJIS.csv'); + $this->assertNoText('Failed importing 4 nodes.'); + $this->assertText('Source file is not in UTF-8 encoding.'); + } +} diff --git a/tests/feeds_tests.module b/tests/feeds_tests.module index fa89462..a9c4d5e 100644 --- a/tests/feeds_tests.module +++ b/tests/feeds_tests.module @@ -351,6 +351,24 @@ function feeds_tests_mapper_unique(FeedsSource $source, $entity_type, $bundle, $ } /** + * Implements hook_feeds_after_parse(). + * + * Empties the list of items to import in case the test says that there are + * items in there with encoding issues. These items can not be processed during + * tests without having a test failure because in < PHP 5.4 that would produce + * the following warning: + * htmlspecialchars(): Invalid multibyte sequence in argument + * + * @see FeedsCSVParserTestCase::testMbstringExtensionDisabled() + */ +function feeds_tests_feeds_after_parse(FeedsSource $source, FeedsParserResult $result) { + if (variable_get('feeds_tests_feeds_after_parse_empty_items', FALSE)) { + // Remove all items. No items will be processed. + $result->items = array(); + } +} + +/** * Helper class to ensure callbacks can be objects. */ class FeedsTestsPreprocess { diff --git a/tests/parser_csv.test b/tests/parser_csv.test index 2905b39..c5cb324 100644 --- a/tests/parser_csv.test +++ b/tests/parser_csv.test @@ -32,6 +32,8 @@ class ParserCSVTest extends DrupalWebTestCase { $this->_testSimple(); $this->_testBatching(); + $this->_testEncodingConversion(); + $this->_testEncodingConversionFailure(); } /** @@ -54,6 +56,51 @@ class ParserCSVTest extends DrupalWebTestCase { } /** + * Simple test of encoding conversion prior to parsing. + */ + protected function _testEncodingConversion() { + // Pull in the $control_result array. + include $this->absolutePath() . '/tests/feeds/encoding.csv.php'; + + $encodings = $this->getEncodings(); + foreach ($encodings as $encoding) { + $file = $this->absolutePath() . "/tests/feeds/encoding_{$encoding}.csv"; + $iterator = new ParserCSVIterator($file); + $parser = new ParserCSV(); + $parser->setDelimiter(','); + $parser->setEncoding($encoding); + $rows = $parser->parse($iterator); + $this->assertFalse($parser->lastLinePos(), format_string('CSV reports all lines parsed, with encoding: %encoding', array('%encoding' => $encoding))); + $this->assertEqual(md5(serialize($rows)), md5(serialize($control_result)), 'Converted and parsed result matches control result.'); + } + } + + /** + * Simple test of failed encoding conversion prior to parsing. + */ + protected function _testEncodingConversionFailure() { + // Pull in the $control_result array. + include $this->absolutePath() . '/tests/feeds/encoding.csv.php'; + + $encodings = $this->getEncodings(); + foreach ($encodings as $encoding) { + $file = $this->absolutePath() . "/tests/feeds/encoding_{$encoding}.csv"; + $iterator = new ParserCSVIterator($file); + $parser = new ParserCSV(); + $parser->setDelimiter(','); + // Attempt to read file as UTF-8. + $parser->setEncoding('UTF-8'); + try { + $rows = $parser->parse($iterator); + $this->fail('Incorrect conversion attempt throws exception.'); + } + catch (ParserCSVEncodingException $e) { + $this->assertNotNull($e->getMessage(), 'Incorrect conversion attempt throws exception.'); + } + } + } + + /** * Test batching. */ protected function _testBatching() { @@ -100,4 +147,11 @@ class ParserCSVTest extends DrupalWebTestCase { 'tab' => "\t", ); } + + static function getEncodings() { + return array( + 'SJIS-win', + 'SJIS', + ); + } }