diff -r a9de5111edc9 includes/FeedsSource.inc --- a/includes/FeedsSource.inc Sun Feb 05 21:05:58 2012 +0400 +++ b/includes/FeedsSource.inc Sun Feb 05 21:08:00 2012 +0400 @@ -343,10 +343,12 @@ // Parse. $parser_result = $this->importer->parser->parse($this, $this->fetcher_result); - module_invoke_all('feeds_after_parse', $this, $parser_result); + if (!empty($parser_result)) { + module_invoke_all('feeds_after_parse', $this, $parser_result); - // Process. - $this->importer->processor->process($this, $parser_result); + // Process. + $this->importer->processor->process($this, $parser_result); + } } catch (Exception $e) { // Do nothing. diff -r a9de5111edc9 libraries/ParserCSV.inc --- a/libraries/ParserCSV.inc Sun Feb 05 21:05:58 2012 +0400 +++ b/libraries/ParserCSV.inc Sun Feb 05 21:08:00 2012 +0400 @@ -74,6 +74,8 @@ public function __construct() { $this->delimiter = ','; + $this->from_encoding = $this->to_encoding = 'UTF-8'; + $this->check_encoding = FALSE; $this->skipFirstLine = FALSE; $this->columnNames = FALSE; $this->timeout = FALSE; @@ -92,6 +94,22 @@ } /** + * Set the source file encoding. + * By default, UTF-8. + */ + public function setEncoding($encoding) { + $this->from_encoding = $encoding; + } + + /** + * Set the option to check source file encoding. + * By default, FALSE. + */ + public function setEncodingCheck($check_encoding) { + $this->check_encoding = $check_encoding; + } + + /** * Set this to TRUE if the parser should skip the first line of the CSV text, * which might be desired if the first line contains the column names. * By default, this is set to FALSE and the first line is not skipped. @@ -194,7 +212,7 @@ for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) { // Make really sure we've got lines without trailing newlines. - $line = trim($lineIterator->current(), "\r\n"); + $line = trim($this->fixEncoding($lineIterator->current()), "\r\n"); // Skip empty lines. 
if (empty($line)) { @@ -232,7 +250,7 @@ } // Ok, so, on with fetching the next line, as mentioned above. $currentField .= "\n"; - $line = trim($lineIterator->current(), "\r\n"); + $line = trim($this->fixEncoding($lineIterator->current()), "\r\n"); $currentIndex = 0; continue; } @@ -258,35 +276,41 @@ $nextQuoteIndex = strpos($line, '"', $currentIndex); $nextDelimiterIndex = strpos($line, $this->delimiter, $currentIndex); - if ($nextQuoteIndex === FALSE) { - $nextIndex = $nextDelimiterIndex; - } - elseif ($nextDelimiterIndex === FALSE) { - $nextIndex = $nextQuoteIndex; - } - else { - $nextIndex = min($nextQuoteIndex, $nextDelimiterIndex); - } - - if ($nextIndex === FALSE) { - // This line is done, add the rest of it as last field. - $currentField .= substr($line, $currentIndex); - $fields[] = $currentField; - break; - } - elseif ($line[$nextIndex] === $this->delimiter[0]) { - $length = ($nextIndex + strlen($this->delimiter) - 1) - $currentIndex; - $currentField .= substr($line, $currentIndex, $length); - $fields[] = $currentField; - $currentField = ''; - $currentIndex += $length + 1; - // Continue with the next field. - } - else { // $line[$nextIndex] == '"' - $quoted = TRUE; - $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex); - $currentIndex = $nextIndex + 1; - // Continue this field in the $quoted == TRUE block. + if ($nextQuoteIndex !== FALSE) { + // Got a double quote. It may only appear in the first position of a field. + if ($nextDelimiterIndex === FALSE || ($nextDelimiterIndex > $nextQuoteIndex)) { + // No delimiter is left on this line, or the quote comes before the next delimiter. + if ($nextQuoteIndex !== $currentIndex) { + // Rules violation, unexpected quote. + throw new Exception(t('CSV file malformed: unexpected double quote in line: @line, column: @column.', array('@line' => $linesParsed, '@column' => $nextQuoteIndex))); + } else { + //;"... + //;"...; + $quoted = TRUE; + $currentIndex++; + } + } else { + // Delimiter precedes the quote (;...;..."): end this field and skip the whole delimiter. 
+ $currentField .= substr($line, $currentIndex, $nextDelimiterIndex - $currentIndex); + $fields[] = $currentField; + $currentField = ''; + $currentIndex = $nextDelimiterIndex + strlen($this->delimiter); + } + } else { + //;...;... + //;... + if ($nextDelimiterIndex === FALSE) { + //;... + $currentField .= substr($line, $currentIndex); + $fields[] = $currentField; + break; + } else { + //;...;... + $currentField .= substr($line, $currentIndex, $nextDelimiterIndex - $currentIndex); + $fields[] = $currentField; + $currentField = ''; + $currentIndex = $nextDelimiterIndex + strlen($this->delimiter); + } } } } @@ -320,4 +344,35 @@ } return $rows; } + + /** + * Checks and converts encoding of input data. + * + * @param $data + * A chunk of data + * @return + * Data in correct encoding or throws exceptions if + * encoding doesn't match or mbstring is not found. + */ + private function fixEncoding($data) { + // Check encoding if needed + if ($this->check_encoding) { + if (function_exists('mb_convert_encoding')) { + if (!mb_check_encoding($data, $this->from_encoding)) { + throw new Exception(t('Source file is not in @encoding encoding.', array('@encoding' => $this->from_encoding))); + } + } else { + throw new Exception(t('For encoding check mbstring PHP extension must be available.')); + } + } + // Convert encoding if needed + if ($this->from_encoding != $this->to_encoding) { + if (function_exists('mb_convert_encoding')) { + $data = mb_convert_encoding($data, $this->to_encoding, $this->from_encoding); + } else { + throw new Exception(t('For encoding conversion mbstring PHP extension must be available.')); + } + } + return $data; + } } diff -r a9de5111edc9 plugins/FeedsCSVParser.inc --- a/plugins/FeedsCSVParser.inc Sun Feb 05 21:05:58 2012 +0400 +++ b/plugins/FeedsCSVParser.inc Sun Feb 05 21:08:00 2012 +0400 @@ -17,6 +17,8 @@ $parser = new ParserCSV(); $delimiter = $source_config['delimiter'] == 'TAB' ? 
"\t" : $source_config['delimiter']; $parser->setDelimiter($delimiter); + $parser->setEncoding(is_array($source_config['encoding']) ? $source_config['encoding']['encoding'] : $source_config['encoding']); + $parser->setEncodingCheck(is_array($source_config['encoding']) ? !empty($source_config['encoding']['check_encoding']) : !empty($source_config['check_encoding'])); $iterator = new ParserCSVIterator($fetcher_result->getFilePath()); if (empty($source_config['no_headers'])) { @@ -101,6 +103,8 @@ public function sourceDefaults() { return array( 'delimiter' => $this->config['delimiter'], + 'encoding' => $this->config['encoding'], + 'check_encoding' => $this->config['check_encoding'], 'no_headers' => $this->config['no_headers'], ); } @@ -145,7 +149,7 @@ '#description' => t('Check if the imported CSV file does not start with a header row. If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'), '#default_value' => isset($source_config['no_headers']) ? $source_config['no_headers'] : 0, ); - return $form; + return $form + $this->configEncodingForm(TRUE); } /** @@ -154,6 +158,8 @@ public function configDefaults() { return array( 'delimiter' => ',', + 'encoding' => 'UTF-8', + 'check_encoding' => FALSE, 'no_headers' => 0, ); } @@ -180,6 +186,45 @@ '#description' => t('Check if the imported CSV file does not start with a header row. 
If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'), '#default_value' => $this->config['no_headers'], ); + return $form + $this->configEncodingForm(); + } + + /** + * Builds the encoding settings fieldset that is appended to the CSV parser forms. + * + * @param $sourceForm + * TRUE to always render the fieldset collapsed. + */ + public function configEncodingForm($sourceForm = FALSE) { + $form = array(); + $defaults = $this->configDefaults(); + $form['encoding'] = array( + '#type' => 'fieldset', + '#title' => t('Encoding conversion'), + '#collapsible' => TRUE, + '#collapsed' => $sourceForm || ($this->config['encoding'] == $defaults['encoding'] && $this->config['check_encoding'] == $defaults['check_encoding']), + ); + if (function_exists('mb_list_encodings')) { + $options = mb_list_encodings(); + $options = array_combine($options, $options); + $form['encoding']['encoding'] = array( + '#type' => 'select', + '#title' => t('Source file encoding'), + '#description' => t('Performs encoding conversion of a source files to UTF-8. Defaults to UTF-8 — no encoding conversion will happen.'), + '#options' => $options, + '#default_value' => $this->config['encoding'], + ); + $form['encoding']['check_encoding'] = array( + '#type' => 'checkbox', + '#title' => t('Check encoding'), + '#description' => t('Checks encoding of a source file and breaks import process if encoding differs.'), + '#default_value' => $this->config['check_encoding'] + ); + } else { + $form['encoding']['encoding'] = array( + '#markup' => '

'.t('Encoding conversion is disabled due to the lack of mbstring PHP extension.').'

', + ); + } return $form; }