diff --git a/plugins/strip_non_utf8.inc b/plugins/strip_non_utf8.inc
new file mode 100644
index 0000000..373f6cd
--- /dev/null
+++ b/plugins/strip_non_utf8.inc
@@ -0,0 +1,62 @@
+ 'feeds_tamper_strip_non_utf8_form',
+  'callback' => 'feeds_tamper_strip_non_utf8_callback',
+  'name' => 'Strip non-UTF-8',
+  'multi' => 'loop',
+  'category' => 'Text',
+);
+
+/**
+ * Form callback for the "Strip non-UTF-8" plugin.
+ *
+ * The form holds no settings; it only displays a description of the plugin.
+ */
+function feeds_tamper_strip_non_utf8_form($importer, $element_key, $settings) {
+  $form = array();
+  $form['html'] = array(
+    '#markup' => t('Removes invalid UTF-8 sequences from a string.'),
+  );
+  return $form;
+}
+
+/**
+ * Tamper callback: strips every byte that is not part of well-formed UTF-8.
+ *
+ * Well-formed means the byte sequences allowed by RFC 3629: overlong forms,
+ * UTF-16 surrogates (U+D800-U+DFFF) and code points above U+10FFFF are
+ * removed together with stray lead or continuation bytes.
+ *
+ * @param string $field
+ *   The value to clean, modified in place. It is left untouched if PCRE
+ *   reports an error, so a failed replacement cannot wipe the value.
+ */
+function feeds_tamper_strip_non_utf8_callback($result, $item_key, $element_key, &$field, $settings, $source) {
+  // Based on the regex from http://stackoverflow.com/a/1401716, tightened so
+  // each alternative matches exactly one well-formed UTF-8 character.
+  $valid = '[\x00-\x7F]'
+    . '|[\xC2-\xDF][\x80-\xBF]'
+    . '|\xE0[\xA0-\xBF][\x80-\xBF]'
+    . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
+    . '|\xED[\x80-\x9F][\x80-\xBF]'
+    . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
+    . '|[\xF1-\xF3][\x80-\xBF]{3}'
+    . '|\xF4[\x80-\x8F][\x80-\xBF]{2}';
+
+  // Runs of valid characters (capped at 100 per match, as in the original
+  // pattern) are captured and kept; any other single byte is dropped.
+  $regex = '/((?:' . $valid . '){1,100})|./s';
+
+  $stripped = preg_replace($regex, '$1', $field);
+  // preg_replace() returns NULL on error; keep the original value then.
+  if ($stripped !== NULL) {
+    $field = $stripped;
+  }
+}
+
diff --git a/plugins/utf8_encode.inc b/plugins/utf8_encode.inc
new file mode 100644
index 0000000..c0cfea0
--- /dev/null
+++ b/plugins/utf8_encode.inc
@@ -0,0 +1,42 @@
+ 'feeds_tamper_utf8_encode_form',
+  'callback' => 'feeds_tamper_utf8_encode_callback',
+  'name' => 'Convert ISO-8859-1 to UTF-8',
+  'multi' => 'loop',
+  'category' => 'Text',
+);
+
+/**
+ * Form callback for the "Convert ISO-8859-1 to UTF-8" plugin.
+ *
+ * The form holds no settings; it only displays a description of the plugin.
+ */
+function feeds_tamper_utf8_encode_form($importer, $element_key, $settings) {
+  $form = array();
+  $form['html'] = array(
+    '#markup' => t('This will convert all ISO-8859-1 characters to their UTF-8 equivalents.'),
+  );
+  return $form;
+}
+
+/**
+ * Tamper callback: converts $field from ISO-8859-1 to UTF-8.
+ *
+ * @param string $field
+ *   The value to convert, modified in place. It is left untouched if the
+ *   conversion fails, so a failure cannot replace the value with FALSE.
+ */
+function feeds_tamper_utf8_encode_callback($result, $item_key, $element_key, &$field, $settings, $source) {
+  $converted = drupal_convert_to_utf8($field, 'ISO-8859-1');
+  // drupal_convert_to_utf8() returns FALSE when the conversion fails.
+  if ($converted !== FALSE) {
+    $field = $converted;
+  }
+}
diff --git a/tests/feeds_tamper_plugins.test b/tests/feeds_tamper_plugins.test
index 99ba24e..0b577d7 100644
--- a/tests/feeds_tamper_plugins.test
+++ b/tests/feeds_tamper_plugins.test
@@ -1020,3 +1020,43 @@ class FeedsTamperUniqueTestCase extends FeedsTamperUnitTestCase {
     $this->execute(array(1, 1, 2, 3, 4), array(1, 
2, 3, 4));
   }
 }
+
+/**
+ * Tests for utf8_encode.inc.
+ */
+class FeedsTamperUTF8EncodeTestCase extends FeedsTamperUnitTestCase {
+
+  protected $plugin_id = 'utf8_encode';
+
+  public static function getInfo() {
+    return array(
+      'name' => 'Plugins: Convert ISO-8859-1 to UTF-8',
+      'description' => 'Unit tests for "Convert ISO-8859-1 to UTF-8" plugin.',
+      'group' => 'Feeds Tamper',
+    );
+  }
+
+  public function test() {
+    $this->execute("\x41\xc0\xc1\xc2\xc4\xc3\xc5", 'AÀÁÂÄÃÅ');
+  }
+}
+
+/**
+ * Tests for strip_non_utf8.inc.
+ */
+class FeedsTamperStripNonUTF8TestCase extends FeedsTamperUnitTestCase {
+
+  protected $plugin_id = 'strip_non_utf8';
+
+  public static function getInfo() {
+    return array(
+      'name' => 'Plugins: Strip non-UTF-8',
+      'description' => 'Unit tests for "Strip non-UTF-8" plugin.',
+      'group' => 'Feeds Tamper',
+    );
+  }
+
+  public function test() {
+    $this->execute("foo\x80\x81\x92\xE0\x80bar", 'foobar');
+  }
+}