From 7f41a08b470902ebcab4977dd731d275d9248593 Mon Sep 17 00:00:00 2001
From: OnkelTem <OnkelTem@239962.no-reply.drupal.org>
Date: Thu, 9 Aug 2012 16:20:38 +0400
Subject: [PATCH] Issue #428272 by OnkelTem: Added support of encoding
 conversions to the CSV Parser

---
 libraries/ParserCSV.inc    |   55 ++++++++++++++++++++++++++++++++++++++++++--
 plugins/FeedsCSVParser.inc |   42 ++++++++++++++++++++++++++++++++-
 2 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/libraries/ParserCSV.inc b/libraries/ParserCSV.inc
index 6a3ff70..a495c59 100644
--- a/libraries/ParserCSV.inc
+++ b/libraries/ParserCSV.inc
@@ -76,6 +76,8 @@ class ParserCSV {
 
   public function __construct() {
     $this->delimiter = ',';
+    $this->from_encoding = $this->to_encoding = 'UTF-8';
+    $this->check_encoding = FALSE;
     $this->skipFirstLine = FALSE;
     $this->columnNames = FALSE;
     $this->timeout = FALSE;
@@ -94,6 +96,22 @@ class ParserCSV {
   }
 
   /**
+   * Set the source file encoding; an empty value falls back to the default.
+   * By default, UTF-8.
+   */
+  public function setEncoding($encoding) {
+    $this->from_encoding = empty($encoding) ? 'UTF-8' : $encoding;
+  }
+
+  /**
+   * Set whether source data is validated against the source file encoding.
+   * By default, FALSE.
+   */
+  public function setEncodingCheck($check_encoding) {
+    $this->check_encoding = $check_encoding;
+  }
+
+  /**
    * Set this to TRUE if the parser should skip the first line of the CSV text,
    * which might be desired if the first line contains the column names.
    * By default, this is set to FALSE and the first line is not skipped.
@@ -196,7 +214,7 @@ class ParserCSV {
     for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {
 
       // Make really sure we've got lines without trailing newlines.
-      $line = trim($lineIterator->current(), "\r\n");
+      $line = trim($this->fixEncoding($lineIterator->current()), "\r\n");
 
       // Skip empty lines.
       if (empty($line)) {
@@ -236,7 +254,7 @@ class ParserCSV {
             }
             // Ok, so, on with fetching the next line, as mentioned above.
             $currentField .= "\n";
-            $line = trim($lineIterator->current(), "\r\n");
+            $line = trim($this->fixEncoding($lineIterator->current()), "\r\n");
             $currentIndex = 0;
             continue;
           }
@@ -324,4 +342,37 @@ class ParserCSV {
     }
     return $rows;
   }
+
+  /**
+   * Checks and, if needed, converts the encoding of a chunk of input data.
+   *
+   * @param $data
+   *   A chunk of data, as read from the source file.
+   *
+   * @return
+   *   The data, converted to the target encoding when the encodings differ.
+   *
+   * @throws Exception
+   *   If the data fails the encoding check, or if the mbstring PHP
+   *   extension is needed but not available.
+   */
+  private function fixEncoding($data) {
+    // Check encoding if needed; guard on the function that is really called.
+    if ($this->check_encoding) {
+      if (!function_exists('mb_check_encoding')) {
+        throw new Exception(t('For encoding check <code>mbstring</code> PHP extension must be available.'));
+      }
+      if (!mb_check_encoding($data, $this->from_encoding)) {
+        throw new Exception(t('Source file is not in @encoding encoding.', array('@encoding' => $this->from_encoding)));
+      }
+    }
+    // Convert encoding only when source and target encodings differ.
+    if ($this->from_encoding != $this->to_encoding) {
+      if (!function_exists('mb_convert_encoding')) {
+        throw new Exception(t('For encoding conversion <code>mbstring</code> PHP extension must be available.'));
+      }
+      $data = mb_convert_encoding($data, $this->to_encoding, $this->from_encoding);
+    }
+    return $data;
+  }
 }
diff --git a/plugins/FeedsCSVParser.inc b/plugins/FeedsCSVParser.inc
index 337dd68..32d4a80 100644
--- a/plugins/FeedsCSVParser.inc
+++ b/plugins/FeedsCSVParser.inc
@@ -22,6 +22,8 @@ class FeedsCSVParser extends FeedsParser {
     $parser = new ParserCSV();
     $delimiter = $source_config['delimiter'] == 'TAB' ? "\t" : $source_config['delimiter'];
     $parser->setDelimiter($delimiter);
+    $parser->setEncoding($source_config['encoding']['encoding']);
+    $parser->setEncodingCheck($source_config['encoding']['check_encoding']);
 
     $iterator = new ParserCSVIterator($fetcher_result->getFilePath());
     if (empty($source_config['no_headers'])) {
@@ -106,6 +108,8 @@ class FeedsCSVParser extends FeedsParser {
   public function sourceDefaults() {
     return array(
       'delimiter' => $this->config['delimiter'],
+      // Nested like the 'encoding' fieldset values read via $source_config['encoding'][...].
+      'encoding' => array('encoding' => $this->config['encoding'], 'check_encoding' => $this->config['check_encoding']),
       'no_headers' => $this->config['no_headers'],
     );
   }
@@ -150,7 +154,7 @@ class FeedsCSVParser extends FeedsParser {
       '#description' => t('Check if the imported CSV file does not start with a header row. If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'),
       '#default_value' => isset($source_config['no_headers']) ? $source_config['no_headers'] : 0,
     );
-    return $form;
+    return $form + $this->configEncodingForm(TRUE);
   }
 
   /**
@@ -159,6 +163,8 @@ class FeedsCSVParser extends FeedsParser {
   public function configDefaults() {
     return array(
       'delimiter' => ',',
+      'encoding' => 'UTF-8', // Same as ParserCSV's default target encoding.
+      'check_encoding' => FALSE, // Encoding validation is opt-in.
       'no_headers' => 0,
     );
   }
@@ -185,6 +191,40 @@ class FeedsCSVParser extends FeedsParser {
       '#description' => t('Check if the imported CSV file does not start with a header row. If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'),
       '#default_value' => $this->config['no_headers'],
     );
+    return $form + $this->configEncodingForm();
+  }
+
+  /**
+   * Builds the encoding fieldset appended to the config and source forms.
+   * The fieldset is always collapsed when $sourceForm is TRUE.
+   */
+  public function configEncodingForm($sourceForm = FALSE) {
+    $defaults = $this->configDefaults();
+    $form['encoding'] = array(
+      '#type' => 'fieldset',
+      '#title' => t('Encoding conversion'),
+      '#collapsible' => TRUE,
+      '#collapsed' => $sourceForm || ($this->config['encoding'] == $defaults['encoding'] && $this->config['check_encoding'] == $defaults['check_encoding']),
+    );
+    if (function_exists('mb_list_encodings')) {
+      $encodings = mb_list_encodings();
+      $form['encoding']['encoding'] = array(
+        '#type' => 'select',
+        '#title' => t('Source file encoding'),
+        '#description' => t('Performs encoding conversion of a source files to UTF-8. Defaults to <code>UTF-8</code> &mdash; no encoding conversion will happen.'),
+        '#options' => array_combine($encodings, $encodings),
+        '#default_value' => $this->config['encoding'],
+      );
+      $form['encoding']['check_encoding'] = array(
+        '#type' => 'checkbox',
+        '#title' => t('Check encoding'),
+        '#description' => t('Checks encoding of a source file and breaks import process if encoding differs.'),
+        '#default_value' => $this->config['check_encoding'],
+      );
+    }
+    else {
+      $form['encoding']['encoding'] = array('#markup' => '<p><em>' . t('Encoding conversion is disabled due to the lack of <code>mbstring</code> PHP extension.') . '</em></p>');
+    }
+    return $form;
+  }
 
-- 
1.7.9.5

