=== modified file 'sites/all/modules/import_html/import_html.module'
--- sites/all/modules/import_html/import_html.module	2010-02-10 00:09:24 +0000
+++ sites/all/modules/import_html/import_html.module	2010-02-11 04:51:36 +0000
@@ -383,6 +383,7 @@
     'recursion_behaviour' => IMPORT_HTML_GLOB_BEFORE,
     'debug_level' => 0,
     'keep_temp_files' => FALSE,
+    'pretidy_cmd' => '',
 
   );
 }

=== modified file 'sites/all/modules/import_html/import_html_process.inc'
--- sites/all/modules/import_html/import_html_process.inc	2010-02-09 23:13:51 +0000
+++ sites/all/modules/import_html/import_html_process.inc	2010-02-11 03:55:25 +0000
@@ -523,6 +523,65 @@
   return TRUE;
 }
 
+/**
+ * 'Pre-tidy' a file: run a command on the HTML file _before_ HTMLtidy sees it.
+ * Code largely copied from xml_tidy_file()
+ *
+ * @param $filepath Path (or remote URL) of the HTML file to pre-tidy.
+ * @param $pretidy_cmd Trusted command line; the escaped filename is appended.
+ * @return The command's STDOUT as a string, or FALSE if anything failed.
+ */
+function import_html_pretidy_file($filepath, $pretidy_cmd) {
+  import_html_debug("Pre-tidying file '$filepath' ");
+  if (! is_local($filepath)) {
+    // Remote file: the command needs a local copy, so download it first.
+    $source = file_get_contents($filepath);
+    if (! $source) {
+      trigger_error("No content from '$filepath'", E_USER_WARNING );
+      return FALSE;
+    }
+    import_html_debug("Retrieved remote file:$filepath is ". strlen($source) ." big");
+    // $_ENV['TEMP'] is frequently unset; fall back to the system temp dir.
+    $target_path = tempnam(isset($_ENV['TEMP']) ? $_ENV['TEMP'] : sys_get_temp_dir(), "htm");
+    file_put_contents( foreslash($target_path) , $source );
+  }
+  else {
+    $target_path = $filepath;
+  }
+
+  if (! file_exists($target_path) ) {
+    import_html_debug("Attempted to pre-tidy a file that doesn't exist. Looking for $target_path failed!", array(), WATCHDOG_ERROR);
+    return FALSE;
+  }
+
+  // The filename can contain shell metacharacters, so escape it; $pretidy_cmd is run as-is.
+  $command = $pretidy_cmd .' '. escapeshellarg(foreslash($target_path));
+  import_html_debug("Running \n$command");
+  $response = array();
+  exec($command, $response, $return_code);
+  $out = join("\n", $response);
+
+  if (!$out) {
+    // Re-run collecting STDERR too; exec() appends to an existing array, so reset it.
+    $response = array();
+    exec($command .' 2>&1', $response, $return_code);
+    $out = join("\n", $response);
+    trigger_error(
+      "The pre-tidy command failed to parse the input!
+      I ran <code>\n$command\n</code>
+      and got: $return_code <pre>". htmlspecialchars($out) ."</pre>\n" , E_USER_WARNING);
+    $out = FALSE;
+  }
+
+  import_html_debug_code("After command-line tidy", $out);
+
+  if ($target_path != $filepath) {
+    // remove temp file we just made up.
+    unlink($target_path);
+  }
+
+  return $out;
+}
 
 /**
  * Analyse a source page and create a node definition from it.
@@ -572,6 +631,28 @@
       trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path");
     }
 
+    if ($profile['pretidy_cmd']) {
+      $data = import_html_pretidy_file($path, $profile['pretidy_cmd']);
+      if ($data) {
+        // Right; now we have 'pre-tidied' output in a string.
+        // However I don't feel safe calling parse_in_xml_string() rather than parse_in_xml_file()
+        // because that has a different 'handling'. (Is sometimes called twice.)
+        // In order not to touch existing code, we'll write to a file. Can always change that later.
+        $temp_path =  foreslash( tempnam($_ENV['TEMP'], "htm") );
+        file_put_contents($temp_path, $data);
+        if (! file_exists($temp_path)) {
+          import_html_debug(
+            "Failed to create/write temp file '%path' with pre-tidied output. Will continue parsing 'un-pre-tidied' file.",
+            array('path' => $temp_path)
+          );
+          $temp_path = '';
+        }
+      }
+    }
+    if (!$temp_path) {
+      $temp_path = $path;
+    }
+
     /*
      * Trying to parse pure XML first is causing problems
      * Either I want everything to be html, (always tidy)
@@ -581,19 +662,24 @@
      */
      // temporarily ignore parser errors (catch?)
     set_error_handler('stfu');
-    $xmldoc = parse_in_xml_file($path, $profile['force_tidy']);
+    $xmldoc = parse_in_xml_file($temp_path, $profile['force_tidy']);
     restore_error_handler();
 
     if (! $xmldoc && $profile['force_tidy'] ) {
       import_html_debug(
         "%path was not tidy enough - running tidy over it now so I can parse it.",
-        array('%path' => $path, '%rel_path' => $rel_path)
+        array('%path' => $temp_path, '%rel_path' => $rel_path)
       );
       // If a raw XML parse failed,
       // tell parse_in_xml_file() to use htmlTidy before it begins
       // TODO - add a flag to skip this double-processing, (parsing twice) it may be a bit slow if it's not often used
-      $xmldoc = parse_in_xml_file($path, TRUE);
+      $xmldoc = parse_in_xml_file($temp_path, TRUE);
+    }
+    if ($temp_path != $path) {
+      // remove temp file we just made up.
+      unlink($temp_path);
     }
+
     #import_html_debug_code("Finished reading from file:", xml_tostring($xmldoc));
     $source_node = new stdClass();
   }

=== modified file 'sites/all/modules/import_html/import_html_ui.inc'
--- sites/all/modules/import_html/import_html_ui.inc	2010-02-18 22:10:33 +0000
+++ sites/all/modules/import_html/import_html_ui.inc	2010-02-18 22:18:17 +0000
@@ -589,7 +589,17 @@
       files/import directory.
     "),
   );
-
+  $form['advanced']['pretidy_cmd'] = array(
+    '#type' => 'textfield',
+    '#title' => t("Pre-tidy command"),
+    '#default_value' => $profile['pretidy_cmd'],
+    '#description' => t("
+      A command to run on each HTML file, before any processing and before HTML Tidy is run.
+      This is only necessary if your files contain such faulty HTML that it even confuses HTML Tidy
+      (like old MS FrontPage files containing insanely placed start/end tags).
+      The command you specify will be run with the full filename appended (acting as a commandline argument). It must output the 'pre-tidied' HTML on STDOUT.
+    "),
+  );
 
   $form['advanced']['import_html_other_logic'] = array(
     '#value' => t("<p>

