Index: word2web.inc
===================================================================
RCS file: word2web.inc
diff -N word2web.inc
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ word2web.inc	22 Apr 2009 22:29:59 -0000
@@ -0,0 +1,133 @@
+<?php
+// $Id$
+/**
+ * @file
+ * Helper functions shared by the standard word2web modules.
+ *
+ * TODO: It'd be nice to allow users to toggle between simple quotes and the
+ * HTML entities as shown on http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
+ */
+
+/**
+ * Replaces Windows-1252 "smart" punctuation bytes with plain ASCII.
+ *
+ * Adapted from http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
+ *
+ * @param $string
+ * Raw text, possibly containing Windows-1252 smart quotes or em dashes.
+ * @return
+ * The text with those bytes replaced; valid UTF-8 input is returned as is.
+ */
+function _word2web_convert_chr($string) {
+  // Bytes 145-151 are also UTF-8 continuation bytes, so a byte-wise replace
+  // on text that is already valid UTF-8 would corrupt multi-byte characters.
+  if (mb_check_encoding($string, 'UTF-8')) {
+    return $string;
+  }
+
+  // Left/right single quote, left/right double quote, em dash.
+  $search = array(chr(145), chr(146), chr(147), chr(148), chr(151));
+  $replace = array("'", "'", '"', '"', '-');
+
+  return str_replace($search, $replace, $string);
+}
+
+/**
+ * Helper function that strips out MS Word tags.
+ *
+ * @param $html
+ * A raw HTML string containing MS Word tags.
+ * @param $strip_images
+ * Optional. Boolean value that triggers the stripping of image tags.
+ * @return
+ * Cleaned up HTML.
+ */
+function _word2web_filter($html, $strip_images = FALSE) {
+  // _word2web_convert_chr() is left disabled here: its byte-wise replace was
+  // found to break other UTF-8 characters.
+
+  // Re-encode only when a supported Windows code page is declared. Input
+  // without a charset leaves $matches empty, so it must not be indexed.
+  if (preg_match('/charset=([\w-]+)/', $html, $matches)) {
+    if ($matches[1] == 'windows-1256' || $matches[1] == 'windows-1252') {
+      $html = iconv($matches[1], 'utf-8', $html);
+    }
+  }
+
+  // If we want to strip images we just skip converting them.
+  if (!$strip_images) {
+    // Convert MS Word image tags into more HTML standard tags so they aren't
+    // filtered out below. They still exist in the HTML for now.
+    $html = _word2web_covert_image_tags($html);
+  }
+
+  // Drop invalid byte sequences, then turn non-ASCII into HTML entities.
+  $html = iconv('UTF-8', 'UTF-8//IGNORE', $html);
+  $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
+
+  // Apply our XSL transformations.
+  $path = drupal_get_path('module', 'word2web') . '/';
+  $html = _word2web_xslt_transform($html, $path .'empty.xsl');
+  $html = _word2web_xslt_transform($html, $path .'w2html.xslt');
+
+  return $html;
+}
+
+/**
+ * Inserts an <image> element in front of every <img> element in the markup.
+ *
+ * The <image> copy carries only the src attribute. The original <img> is left
+ * in place and is expected to be cleaned up by the caller.
+ *
+ * @param $html
+ * HTML string containing MS Word tags.
+ * @return
+ * The parsed document serialized with DOMDocument::saveXML().
+ */
+function _word2web_covert_image_tags($html) {
+  $document = new DOMDocument();
+
+  // Word output is guaranteed not to be entirely valid HTML, so the parser
+  // warnings reminding us of this fact are suppressed.
+  @$document->loadHTML($html);
+
+  // TODO: do something with the v:imagedata elements as well?
+
+  // The (validating) XSL transformations wipe out img tags, so give each one
+  // an <image> sibling that survives them.
+  foreach ($document->getElementsByTagName('img') as $img) {
+    $src = $img->getAttribute('src');
+
+    $copy = $document->createElement('image');
+    $copy->setAttribute('src', $src);
+    $img->parentNode->insertBefore($copy, $img);
+  }
+
+  return $document->saveXML();
+}
+
+/**
+ * Utility function for crunching XML through XSLT.
+ *
+ * @param $xml HTML string, or an already parsed document object.
+ * @param $xsl_file Path of the stylesheet to load.
+ * @param $params Optional. Stylesheet parameters as a name => value array.
+ * @return The result of XSLTProcessor::transformToXML().
+ */
+function _word2web_xslt_transform($xml, $xsl_file, $params = array()) {
+  $xsl = new DOMDocument();
+  $xsl->load($xsl_file);
+  $xslt = new XSLTProcessor();
+  $xslt->importStylesheet($xsl);
+  if (!empty($params)) {
+    $xslt->setParameter('', $params);
+  }
+  if (!is_object($xml)) {
+    // Word markup is not entirely valid HTML, so parser warnings are muted.
+    $x = new DOMDocument();
+    @$x->loadHTML($xml);
+    $xml = $x;
+  }
+  return $xslt->transformToXML($xml);
+}
+
Index: word2web.module
===================================================================
RCS file: /cvs/drupal-contrib/contributions/modules/word2web/word2web.module,v
retrieving revision 1.6.2.2
diff -u -p -w -r1.6.2.2 word2web.module
--- word2web.module	22 Jan 2009 00:27:32 -0000	1.6.2.2
+++ word2web.module	22 Apr 2009 22:29:59 -0000
@@ -243,43 +243,9 @@ function word2web_nodeapi(&$node, $op) {
   $validators = array(
   );
       if ($html_file = file_save_upload('word_document', $validators)) {
-        $path = drupal_get_path('module', 'word2web');
-        $html_raw = file_get_contents($html_file->filepath);
-        $html_raw = _word2web_convert_chr($html_raw);
-        set_error_handler('_word2web_suppress_errors');
-
-        preg_match('/charset=([\w-]+)/', $html_raw, $matches);
-        
-        if ($matches[1] == 'windows-1256') {
-          $html_raw = iconv('windows-1256', 'utf-8', $html_raw);
-        }
-        if ($matches[1] == 'windows-1252') {
-          $html_raw = iconv('windows-1252', 'utf-8', $html_raw);
-        } 
-        $html = new DOMDocument();
-        $html->loadHTML($html_raw);
-        $images = $html->getElementsByTagName("img");
-        $vimages = $html->getElementsByTagName("v:imagedata");
-        /*
-         * This little bit changes img tags into 
-         * image tags so that they aren't wiped out by the (validating)
-         * XSL transformations. It also collects a list of URLs of images
-         * so that we'll know what to fetch from the user
-         */
-        foreach ($images as $im) {
-          $image_node = $html->createElement("image");
-          $image_node->appendChild($html->createTextNode("&nbsp;"));
-          $image_node->setAttribute("src", $im->getAttribute("src"));
-          $im->parentNode->insertBefore($image_node, $im);
-        }
-        $html = $html->saveXML();
-        $html = iconv("UTF-8", "UTF-8//IGNORE", $html);
-        $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); 
-        $html = _word2web_xslt_transform($html, $path .'/empty.xsl');
-        $html = _word2web_xslt_transform($html, $path .'/w2html.xslt');
-        // Somehow /> is getting produced at certain places in the document -- let's take them out 
-        $html = str_replace('/&gt;', '', $html);
-        restore_error_handler();
+        // Include our helper file.
+        module_load_include('inc', 'word2web');
+        $html = _word2web_filter(file_get_contents($html_file->filepath));
 
         if (variable_get('word2web_filter', 0)) {
           $node->format = variable_get('word2web_filter', 0);
@@ -337,41 +303,14 @@ function _word2web_get_images($c) {
  */
 
 function word2web_manual($html_raw) {
-  $html_raw = _word2web_convert_chr($html_raw);
-  preg_match('/charset=([\w-]+)/', $html_raw, $matches);
         
-  if ($matches[1] == 'windows-1256') {
-      $html_raw = iconv('windows-1256', 'utf-8', $html_raw);
-    }
-  if ($matches[1] == 'windows-1252') {
-      $html_raw = iconv('windows-1252', 'utf-8', $html_raw);
-    }
-  $path = drupal_get_path('module', 'word2web');
-  set_error_handler('_word2web_suppress_errors');
-  $html = $html_raw;
   $html_raw = preg_replace("/<(img)([^>]*)>/mi",
   "<addr class='image' $2>-</addr>",
   $html_raw);
 
-  //preg_match('/charset=([\w-]+)/', $html_raw, $matches);
-  // This step apparently cleans up the XML a little.
-  $html = new DOMDocument();
-  $html->loadHTML($html_raw);
-  //echo "charset? ".$charset;
-  $html = $html->saveXML();
-  // Normalizes some odd utf-8 characters
-  
-  // Convert from the charset it says it is into UTF-8
-  //$html = iconv($matches[1], "UTF-8//IGNORE", $html);
-  $html = iconv("UTF-8", "UTF-8//IGNORE", $html);
-        $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); 
-  // Actually run the xsl transformations
-  $html = _word2web_xslt_transform($html, $path .'/empty.xsl');
-
-  $html = _word2web_xslt_transform($html, $path .'/w2html.xslt'); 
-  restore_error_handler();
-  $html = str_replace('/&gt;', '', $html);
-  return $html;
+  // Include our helper file.
+  module_load_include('inc', 'word2web');
+  return _word2web_filter($html_raw);
 }
 
 /**
@@ -434,54 +373,6 @@ function word2web_settings_form() {
 }
 
 /**
- * Utility function to replace microsoft "smart" characters into usable
- * UTF-8. From http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
- */
-function _word2web_convert_chr($string) {
-  $search = array(
-    chr(145),
-    chr(146),
-    chr(147),
-    chr(148),
-    chr(151)
-  );
- 
-  $replace = array(
-    "'", 
-    "'", 
-    '"', 
-    '"', 
-    '-'
-  );
-
-  return str_replace($search, $replace, $string); 
-}
-
-/**
  * Utility function for hiding errors, esp. on loading XML from bad HTML input
  */
 function _word2web_suppress_errors() { }
-
-/**
- * Utility function for crunching XML through XSLT
- */
-function _word2web_xslt_transform($xml, $xsl_file, $params = array()) {
-  // load specified stylesheet and set any parameters
-  // Without Domdocument charsets: french breaks, arabic remains broken
-  // 
-  $xsl = new DOMDocument();
-  $xsl->load($xsl_file);
-  $xslt = new XSLTProcessor();
-  $xslt->importStylesheet($xsl);
-  // check whether input is string or object
-  if (!is_object($xml)) {
-    $x = new DOMDocument();
-    $x->loadHTML($xml);
-  }
-  else {
-    $x = $xml;
-  }
-  // return transformed xml
-  return $xslt->transformToXML($x);
-}
-
Index: word2web_filter.info
===================================================================
RCS file: word2web_filter.info
diff -N word2web_filter.info
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ word2web_filter.info	22 Apr 2009 22:29:59 -0000
@@ -0,0 +1,4 @@
+; $Id$
+name = MS Word Filter
+description = Removes pesky Word tags added from "HTML" code pasted from MS Word.
+core = 6.x
Index: word2web_filter.module
===================================================================
RCS file: word2web_filter.module
diff -N word2web_filter.module
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ word2web_filter.module	22 Apr 2009 22:29:59 -0000
@@ -0,0 +1,59 @@
+<?php
+// $Id$
+/**
+ * @file
+ * Filter Microsoft Word tags.
+ */
+
+/**
+ * Implementation of hook_filter().
+ *
+ * Provides a single filter, "MS Word Cleanup", which runs text through
+ * _word2web_filter(). Any operation not handled here returns $text untouched.
+ */
+function word2web_filter_filter($op, $delta = 0, $format = -1, $text = '') {
+  if ($op == 'list') {
+    return array(0 => t('MS Word Cleanup'));
+  }
+  if ($op == 'description') {
+    return t('Removes pesky Word tags added from "HTML" code pasted from MS Word.');
+  }
+  if ($op == 'no cache') {
+    return FALSE;
+  }
+  if ($op == 'settings') {
+    return _word2web_filter_settings($format);
+  }
+  if ($op == 'process') {
+    // Load the helper file, then filter using this format's image setting.
+    module_load_include('inc', 'word2web');
+    $strip_images = variable_get("word2web_strip_images_$format", FALSE);
+    return _word2web_filter($text, $strip_images);
+  }
+  return $text;
+}
+
+/**
+ * Settings form for the word2web MS Word cleanup filter.
+ *
+ * @param $format
+ * The input format being configured; used to build the setting's name.
+ * @return
+ * FormAPI array.
+ */
+function _word2web_filter_settings($format) {
+  $form['word2web_filter'] = array(
+    '#type' => 'fieldset',
+    '#title' => t('MS Word Cleanup'),
+    '#collapsible' => TRUE,
+  );
+
+  $form['word2web_filter']["word2web_strip_images_$format"] = array(
+    '#type' => 'checkbox',
+    '#title' => t('Strip Word image tags.'),
+    '#default_value' => variable_get("word2web_strip_images_$format", FALSE),
+    '#description' => t('If enabled, Microsoft Word image tags will be stripped. Otherwise they will be converted to html image tags.<br /> Note: The filter will not make sure these images exist.'),
+  );
+
+  return $form;
+}