From 057a59ca3868f0b21b4e6ff1f26f1437adbf938c Mon Sep 17 00:00:00 2001
From: Bob Vincent <bobvin@pillars.net>
Date: Mon, 30 May 2011 05:50:33 -0400
Subject: [PATCH] Issue #299138 by catch, Kevin Hankens, drewish, arjenk, jrglasgow, stella, sun, kscheirer, lilou, pillarsdotnet, stephandale, salvis: Fix the broken formatting in drupal_html_to_text() and also add tests.

---
 includes/mail.inc                  |  601 +++++++++++++++++++++++-------------
 modules/simpletest/tests/mail.test |  371 ++++++++++++++++++++++
 2 files changed, 757 insertions(+), 215 deletions(-)

diff --git a/includes/mail.inc b/includes/mail.inc
index be2df923427ec363f671132771e9c97ee490c090..14bb54e340a57e75c5edf8214c69ad7a4cc35d17 100644
--- a/includes/mail.inc
+++ b/includes/mail.inc
@@ -267,7 +267,7 @@ interface MailSystemInterface {
    * @return
    *   The formatted $message.
    */
-   public function format(array $message);
+  public function format(array $message);
 
   /**
    * Send a message composed by drupal_mail().
@@ -294,7 +294,7 @@ interface MailSystemInterface {
    * @return
    *   TRUE if the mail was successfully accepted for delivery, otherwise FALSE.
    */
-   public function mail(array $message);
+  public function mail(array $message);
 }
 
 /**
@@ -303,40 +303,49 @@ interface MailSystemInterface {
  * We use delsp=yes wrapping, but only break non-spaced languages when
  * absolutely necessary to avoid compatibility issues.
  *
- * We deliberately use LF rather than CRLF, see drupal_mail().
+ * We deliberately use variable_get('mail_line_endings', MAIL_LINE_ENDINGS)
+ * rather than "\r\n".
  *
  * @param $text
  *   The plain text to process.
- * @param $indent (optional)
- *   A string to indent the text with. Only '>' characters are repeated on
- *   subsequent wrapped lines. Others are replaced by spaces.
+ * @param $indent
+ *   (optional) A string to indent the text with. Only '>' characters are
+ *   repeated on subsequent wrapped lines. Others are replaced by spaces.
+ * @param $line_length
+ *   (optional) The line length at which to wrap. Defaults to 78.
+ *
+ * @see drupal_mail()
  */
-function drupal_wrap_mail($text, $indent = '') {
-  // Convert CRLF into LF.
-  $text = str_replace("\r", '', $text);
-  // See if soft-wrapping is allowed.
+function drupal_wrap_mail($text, $indent = '', $line_length = 78) {
+  // Convert LF or CRLF into platform-specific line-endings.
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
+  $text = preg_replace('/\r?\n/', $eol, $text);
   $clean_indent = _drupal_html_to_text_clean($indent);
-  $soft = strpos($clean_indent, ' ') === FALSE;
+  $values = array(
+    'length' => drupal_strlen($indent),
+    'max' => $line_length,
+  );
   // Check if the string has line breaks.
-  if (strpos($text, "\n") !== FALSE) {
-    // Remove trailing spaces to make existing breaks hard.
-    $text = preg_replace('/ +\n/m', "\n", $text);
+  if (strpos($text, $eol) !== FALSE) {
     // Wrap each line at the needed width.
-    $lines = explode("\n", $text);
-    array_walk($lines, '_drupal_wrap_mail_line', array('soft' => $soft, 'length' => strlen($indent)));
-    $text = implode("\n", $lines);
+    $lines = explode($eol, $text);
+    array_walk($lines, '_drupal_wrap_mail_line', $values);
+    $text = implode($eol, $lines);
   }
   else {
     // Wrap this line.
-    _drupal_wrap_mail_line($text, 0, array('soft' => $soft, 'length' => strlen($indent)));
+    _drupal_wrap_mail_line($text, 0, $values);
   }
   // Empty lines with nothing but spaces.
-  $text = preg_replace('/^ +\n/m', "\n", $text);
+  $text = preg_replace("/^ +$eol/m", $eol, $text);
   // Space-stuff special lines.
-  $text = preg_replace('/^(>| |From)/m', ' $1', $text);
+  $text = preg_replace('/^( |>|From)/m', ' $1', $text);
+  // Strip and save trailing $eol chars to work around a bug in older PCRE
+  // libraries.
+  $stripped = rtrim($text, $eol);
+  $suffix = drupal_substr($text, drupal_strlen($stripped));
   // Apply indentation. We only include non-'>' indentation on the first line.
-  $text = $indent . substr(preg_replace('/^/m', $clean_indent, $text), strlen($indent));
-
+  $text = $indent . drupal_substr(preg_replace('/^/m', $clean_indent, $stripped), drupal_strlen($indent)) . $suffix;
   return $text;
 }
 
@@ -347,177 +356,381 @@ function drupal_wrap_mail($text, $indent = '') {
  * The output will be suitable for use as 'format=flowed; delsp=yes' text
  * (RFC 3676) and can be passed directly to drupal_mail() for sending.
  *
- * We deliberately use LF rather than CRLF, see drupal_mail().
+ * We deliberately use variable_get('mail_line_endings', MAIL_LINE_ENDINGS)
+ * rather than "\r\n".
  *
  * This function provides suitable alternatives for the following tags:
- * <a> <em> <i> <strong> <b> <br> <p> <blockquote> <ul> <ol> <li> <dl> <dt>
- * <dd> <h1> <h2> <h3> <h4> <h5> <h6> <hr>
+ *
+ * <a> <address> <b> <blockquote> <br /> <cite> <dd> <dl> <dt> <em>
+ * <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <ol> <p> <pre>
+ * <strong> <u> <ul>
+ *
+ * The following tags are also handled:
+ *
+ * <div>: Rendered the same as a <p> tag.
+ * <tr>: Each table row ends with a line break.
+ * <td>: Two spaces are inserted between adjacent table cells.
  *
  * @param $string
  *   The string to be transformed.
- * @param $allowed_tags (optional)
- *   If supplied, a list of tags that will be transformed. If omitted, all
- *   all supported tags are transformed.
+ * @param $allowed_tags
+ *   (optional) If supplied, a list of tags that will be transformed. If
+ *   omitted, all supported tags are transformed.
  *
  * @return
  *   The transformed string.
+ *
+ * @see drupal_mail()
  */
 function drupal_html_to_text($string, $allowed_tags = NULL) {
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
   // Cache list of supported tags.
   static $supported_tags;
-  if (empty($supported_tags)) {
-    $supported_tags = array('a', 'em', 'i', 'strong', 'b', 'br', 'p', 'blockquote', 'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr');
+  if (!isset($supported_tags)) {
+    $supported_tags = array(
+      'a', 'address', 'b', 'blockquote', 'br', 'cite', 'dd', 'div', 'dl',
+      'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'li',
+      'ol', 'p', 'pre', 'strong', 'table', 'td', 'tr', 'u', 'ul',
+    );
   }
 
   // Make sure only supported tags are kept.
   $allowed_tags = isset($allowed_tags) ? array_intersect($supported_tags, $allowed_tags) : $supported_tags;
 
-  // Make sure tags, entities and attributes are well-formed and properly nested.
-  $string = _filter_htmlcorrector(filter_xss($string, $allowed_tags));
-
-  // Apply inline styles.
-  $string = preg_replace('!</?(em|i)((?> +)[^>]*)?>!i', '/', $string);
-  $string = preg_replace('!</?(strong|b)((?> +)[^>]*)?>!i', '*', $string);
-
-  // Replace inline <a> tags with the text of link and a footnote.
-  // 'See <a href="http://drupal.org">the Drupal site</a>' becomes
-  // 'See the Drupal site [1]' with the URL included as a footnote.
-  _drupal_html_to_mail_urls(NULL, TRUE);
-  $pattern = '@(<a[^>]+?href="([^"]*)"[^>]*?>(.+?)</a>)@i';
-  $string = preg_replace_callback($pattern, '_drupal_html_to_mail_urls', $string);
-  $urls = _drupal_html_to_mail_urls();
-  $footnotes = '';
-  if (count($urls)) {
-    $footnotes .= "\n";
-    for ($i = 0, $max = count($urls); $i < $max; $i++) {
-      $footnotes .= '[' . ($i + 1) . '] ' . $urls[$i] . "\n";
+  // Parse $string into a DOM tree.
+  $dom = filter_dom_load($string);
+  $notes = array();
+  $text = _drupal_html_to_text($dom->documentElement, $allowed_tags, $notes);
+  // Convert non-breaking spaces (chr(160)) back to regular spaces, then trim
+  // leading and trailing line-ending characters.
+  $text = trim(str_replace(chr(160), ' ', $text), $eol);
+  // Hard-wrap at 1000 characters and space-stuff special lines.
+  $text = drupal_wrap_mail($text, '', 1000);
+  // Add footnotes.
+  if ($notes) {
+    // Add a blank line before the footnote list.
+    $text .= $eol;
+    foreach ($notes as $url => $note) {
+      $text .= $eol . '[' . $note . '] ' . $url;
     }
   }
+  return $text;
+}
 
-  // Split tags from text.
-  $split = preg_split('/<([^>]+?)>/', $string, -1, PREG_SPLIT_DELIM_CAPTURE);
-  // Note: PHP ensures the array consists of alternating delimiters and literals
-  // and begins and ends with a literal (inserting $null as required).
-
-  $tag = FALSE; // Odd/even counter (tag or no tag)
-  $casing = NULL; // Case conversion function
-  $output = '';
-  $indent = array(); // All current indentation string chunks
-  $lists = array(); // Array of counters for opened lists
-  foreach ($split as $value) {
-    $chunk = NULL; // Holds a string ready to be formatted and output.
-
-    // Process HTML tags (but don't output any literally).
-    if ($tag) {
-      list($tagname) = explode(' ', strtolower($value), 2);
-      switch ($tagname) {
-        // List counters
-        case 'ul':
-          array_unshift($lists, '*');
-          break;
-        case 'ol':
-          array_unshift($lists, 1);
-          break;
-        case '/ul':
-        case '/ol':
-          array_shift($lists);
-          $chunk = ''; // Ensure blank new-line.
-          break;
-
-        // Quotation/list markers, non-fancy headers
-        case 'blockquote':
-          // Format=flowed indentation cannot be mixed with lists.
-          $indent[] = count($lists) ? ' "' : '>';
-          break;
-        case 'li':
-          $indent[] = is_numeric($lists[0]) ? ' ' . $lists[0]++ . ') ' : ' * ';
-          break;
-        case 'dd':
-          $indent[] = '    ';
-          break;
-        case 'h3':
-          $indent[] = '.... ';
-          break;
-        case 'h4':
-          $indent[] = '.. ';
-          break;
-        case '/blockquote':
-          if (count($lists)) {
-            // Append closing quote for inline quotes (immediately).
-            $output = rtrim($output, "> \n") . "\"\n";
-            $chunk = ''; // Ensure blank new-line.
+/**
+ * Helper function for drupal_html_to_text().
+ *
+ * Recursively converts $node to text, wrapping and indenting as necessary.
+ *
+ * @param $node
+ *   The source DOMNode.
+ * @param $allowed_tags
+ *   A list of tags that will be transformed.
+ * @param $notes
+ *   The list of footnotes, an associative array of (url => reference number)
+ *   items.
+ * @param $parents
+ *   (optional) The list of ancestor tags, from nearest to most distant.
+ * @param $count
+ *   (optional) The number for the next item of an ordered list. Defaults to 1.
+ * @param $line_length
+ *   (optional) The maximum length of a line, for wrapping. Defaults to 78.
+ */
+function _drupal_html_to_text(DOMNode $node, array $allowed_tags, array &$notes, array $parents = array(), &$count = NULL, $line_length = 78) {
+  if (!isset($count)) {
+    $count = 1;
+  }
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
+  if ($node->nodeType === XML_TEXT_NODE) {
+    // For text nodes, we just copy the text content.
+    $text = $node->textContent;
+    if (in_array('pre', $parents)) {
+      // Within <pre> tags, all spaces are non-breaking.
+      $text = str_replace(' ', chr(160), $text);
+    }
+    else {
+      // Outside <pre> tags, collapse whitespace.
+      $text = preg_replace('/[[:space:]]+/', ' ', $text);
+      // Trim spaces around newlines.
+      $text = preg_replace('/ *\n */', "\n", $text);
+    }
+    return $text;
+  }
+  // Non-text node.
+  $tag = '';
+  $text = '';
+  $child_text = '';
+  $child_count = 1;
+  $indent = '';
+  $prefix = '';
+  $suffix = '';
+  if (isset($node->tagName) && in_array($node->tagName, $allowed_tags)) {
+    $tag = $node->tagName;
+    switch ($tag) {
+      // Turn links with valid hrefs into footnotes.
+      case 'a':
+        $test = !empty($node->attributes);
+        $test = $test && ($href = $node->attributes->getNamedItem('href'));
+        $test = $test && ($url = url(ltrim($href->nodeValue, '/'), array('absolute' => TRUE)));
+        $test = $test && valid_url($url);
+        if ($test) {
+          // Only add links that have not already been added.
+          if (isset($notes[$url])) {
+            $note = $notes[$url];
+          }
+          else {
+            $note = count($notes) + 1;
+            $notes[$url] = $note;
           }
-          // Fall-through
-        case '/li':
-        case '/dd':
-          array_pop($indent);
-          break;
-        case '/h3':
-        case '/h4':
-          array_pop($indent);
-        case '/h5':
-        case '/h6':
-          $chunk = ''; // Ensure blank new-line.
-          break;
-
-        // Fancy headers
-        case 'h1':
-          $indent[] = '======== ';
-          $casing = 'drupal_strtoupper';
-          break;
-        case 'h2':
-          $indent[] = '-------- ';
-          $casing = 'drupal_strtoupper';
-          break;
-        case '/h1':
-        case '/h2':
-          $casing = NULL;
-          // Pad the line with dashes.
-          $output = _drupal_html_to_text_pad($output, ($tagname == '/h1') ? '=' : '-', ' ');
-          array_pop($indent);
-          $chunk = ''; // Ensure blank new-line.
-          break;
-
-        // Horizontal rulers
-        case 'hr':
-          // Insert immediately.
-          $output .= drupal_wrap_mail('', implode('', $indent)) . "\n";
-          $output = _drupal_html_to_text_pad($output, '-');
-          break;
-
-        // Paragraphs and definition lists
-        case '/p':
-        case '/dl':
-          $chunk = ''; // Ensure blank new-line.
-          break;
+          $suffix = ' [' . $note . ']';
+        }
+        break;
+
+      // Generic block-level tags.
+      case 'address':
+      case 'div':
+      case 'p':
+      case 'pre':
+        $text = $eol;
+        $suffix = $eol;
+        break;
+
+      // Forced line break.
+      case 'br':
+        $text = $eol;
+        break;
+
+      // Boldface by wrapping with "*" characters.
+      case 'b':
+      case 'strong':
+        $prefix = '*';
+        $suffix = '*';
+        break;
+
+      // Italicize by wrapping with "/" characters.
+      case 'cite':
+      case 'em':
+      case 'i':
+        $prefix = '/';
+        $suffix = '/';
+        break;
+
+      // Underline by wrapping with "_" characters.
+      case 'u':
+        $prefix = '_';
+        $suffix = '_';
+        break;
+
+      // Blockquotes are indented by "> " at each level.
+      case 'blockquote':
+        $text = $eol;
+        // chr(160) is the non-breaking space character.
+        $indent = '>' . chr(160);
+        $suffix = $eol;
+        break;
+
+      // Dictionary definitions are indented by four spaces.
+      case 'dd':
+        // chr(160) is the non-breaking space character.
+        $indent = chr(160) . chr(160) . chr(160) . chr(160);
+        $suffix = $eol;
+        break;
+
+      // Dictionary list.
+      case 'dl':
+        // Start on a newline except inside other lists.
+        if (!in_array('li', $parents)) {
+          $text = $eol;
+        }
+        $suffix = $eol;
+        break;
+
+      // Dictionary term.
+      case 'dt':
+        $suffix = $eol;
+        break;
+
+      // Header level 1 is prefixed by eight "=" characters.
+      case 'h1':
+        $text = "$eol$eol";
+        $indent = '======== ';
+        $suffix = $eol;
+        break;
+
+      // Header level 2 is prefixed by six "-" characters.
+      case 'h2':
+        $text = "$eol$eol";
+        $indent = '------ ';
+        $suffix = $eol;
+        break;
+
+      // Header level 3 is prefixed by four "." characters and a space.
+      case 'h3':
+        $text = "$eol$eol";
+        // chr(160) is the non-breaking space character.
+        $indent = '....' . chr(160);
+        $suffix = $eol;
+        break;
+
+      // Header level 4 is prefixed by three "." characters and a space.
+      case 'h4':
+        $text = "$eol$eol";
+        // chr(160) is the non-breaking space character.
+        $indent = '...' . chr(160);
+        $suffix = $eol;
+        break;
+
+      // Header level 5 is prefixed by two "." characters and a space.
+      case 'h5':
+        $text = "$eol$eol";
+        // chr(160) is the non-breaking space character.
+        $indent = '..' . chr(160);
+        $suffix = $eol;
+        break;
+
+      // Header level 6 is prefixed by one "." character and a space.
+      case 'h6':
+        $text = "$eol$eol";
+        // chr(160) is the non-breaking space character.
+        $indent = '.' . chr(160);
+        $suffix = $eol;
+        break;
+
+      // Horizontal rulers become a line of 78 "-" characters.
+      case 'hr':
+        $text = $eol . str_repeat('-', 78) . $eol;
+        break;
+
+      // List items are treated differently depending on the parent tag.
+      case 'li':
+        // Ordered list item.
+        if (reset($parents) === 'ol') {
+          // Check the value attribute.
+          $test = !empty($node->attributes);
+          $test = $test && ($value = $node->attributes->getNamedItem('value'));
+          if ($test) {
+            $count = (int) $value->nodeValue;
+          }
+          // chr(160) is the non-breaking space character.
+          $indent = ($count < 10 ? chr(160) : '') . chr(160) . "$count)" . chr(160);
+          $count++;
+        }
+        // Unordered list item.
+        else {
+          // chr(160) is the non-breaking space character.
+          $indent = chr(160) . '*' . chr(160);
+        }
+        $suffix = $eol;
+        break;
+
+      // Ordered lists.
+      case 'ol':
+        // Start on a newline except inside other lists.
+        if (!in_array('li', $parents)) {
+          $text = $eol;
+        }
+        // Check the start attribute.
+        $test = !empty($node->attributes);
+        $test = $test && ($value = $node->attributes->getNamedItem('start'));
+        if ($test) {
+          $child_count = (int) $value->nodeValue;
+        }
+        break;
+
+      // Start and end tables on a new line.
+      case 'table':
+        $text = $eol;
+        $suffix = $eol;
+        break;
+
+      // Put two non-breaking spaces after each cell that has a next sibling.
+      case 'td':
+        if (!empty($node->nextSibling)) {
+          // chr(160) is the non-breaking space character.
+          $suffix = chr(160) . chr(160);
+        }
+        break;
+
+      // End each table row with a newline.
+      case 'tr':
+        $suffix = $eol;
+        break;
+
+      // Unordered lists.
+      case 'ul':
+        // Start on a newline except inside other lists.
+        if (!in_array('li', $parents)) {
+          $text = $eol;
+        }
+        break;
+
+      default:
+        // Coder review complains if there is no default case.
+        break;
+    }
+    // Only add allowed tags to the $parents array.
+    array_unshift($parents, $tag);
+  }
+  // Copy each child node to output.
+  if ($node->hasChildNodes()) {
+    foreach ($node->childNodes as $child) {
+      $child_text .= _drupal_html_to_text($child, $allowed_tags, $notes, $parents, $child_count, $line_length - drupal_strlen($indent));
+    }
+  }
+  // We only add prefix and suffix if the child nodes were non-empty.
+  if (drupal_strlen($child_text)) {
+    // Don't add a newline to an existing newline.
+    if ($suffix === $eol && drupal_substr($child_text, - drupal_strlen($eol)) === $eol) {
+      $suffix = '';
+    }
+    $child_text = $prefix . $child_text . $suffix;
+    // Remove spaces around newlines, except within <pre> tags.
+    if (!in_array('pre', $parents)) {
+      $child_text = preg_replace('/ *\n */', "\n", $child_text);
+    }
+    // We capitalize the contents of h1 and h2 tags.
+    if ($tag === 'h1' || $tag === 'h2') {
+      $child_text = drupal_strtoupper($child_text);
+      // For h1 and h2 tags, pad each non-empty line with the
+      // character used for indentation.
+      $pad = drupal_substr($indent, 0, 1);
+      $lines = explode($eol, $child_text);
+      foreach ($lines as $i => $line) {
+        if (drupal_strlen($line)) {
+          $parts = explode($eol, drupal_wrap_mail($line, $indent));
+          foreach ($parts as $j => $part) {
+            $part = rtrim($part);
+            $repeat = $line_length - 2 - drupal_strlen($part);
+            if ($repeat > 0) {
+              // chr(160) is the non-breaking space character.
+              $part .= chr(160);
+              $part .= str_repeat($pad, $repeat);
+            }
+            $parts[$j] = $part;
+          }
+          $lines[$i] = implode($eol, $parts);
+        }
       }
+      $child_text = implode($eol, $lines);
     }
-    // Process blocks of text.
     else {
-      // Convert inline HTML text to plain text; not removing line-breaks or
-      // white-space, since that breaks newlines when sanitizing plain-text.
-      $value = trim(decode_entities($value));
-      if (drupal_strlen($value)) {
-        $chunk = $value;
+      if (in_array('pre', $parents)) {
+        // The default limit is 78; <pre> lines get 920 more (998 by default).
+        $line_length += 920;
       }
-    }
-
-    // See if there is something waiting to be output.
-    if (isset($chunk)) {
-      // Apply any necessary case conversion.
-      if (isset($casing)) {
-        $chunk = $casing($chunk);
+      $child_text = wordwrap($child_text, $line_length - drupal_strlen($indent), $eol);
+      $parts = explode($eol, $child_text);
+      $clean = _drupal_html_to_text_clean($indent);
+      foreach ($parts as $i => $part) {
+        if (drupal_strlen($part)) {
+          $parts[$i] = ($i ? $clean : $indent) . $part;
+        }
       }
-      // Format it and apply the current indentation.
-      $output .= drupal_wrap_mail($chunk, implode('', $indent));
-      // Remove non-quotation markers from indentation.
-      $indent = array_map('_drupal_html_to_text_clean', $indent);
+      $child_text = implode($eol, $parts);
     }
-
-    $tag = !$tag;
+    $text .= $child_text;
   }
-
-  return $output . $footnotes;
+  return $text;
 }
 
 /**
@@ -526,61 +739,19 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
  * Wraps words on a single line.
  */
 function _drupal_wrap_mail_line(&$line, $key, $values) {
-  // Use soft-breaks only for purely quoted or unindented text.
-  $line = wordwrap($line, 77 - $values['length'], $values['soft'] ? "  \n" : "\n");
-  // Break really long words at the maximum width allowed.
-  $line = wordwrap($line, 996 - $values['length'], $values['soft'] ? " \n" : "\n");
-}
-
-/**
- * Helper function for drupal_html_to_text().
- *
- * Keeps track of URLs and replaces them with placeholder tokens.
- */
-function _drupal_html_to_mail_urls($match = NULL, $reset = FALSE) {
-  global $base_url, $base_path;
-  static $urls = array(), $regexp;
-
-  if ($reset) {
-    // Reset internal URL list.
-    $urls = array();
-  }
-  else {
-    if (empty($regexp)) {
-      $regexp = '@^' . preg_quote($base_path, '@') . '@';
-    }
-    if ($match) {
-      list(, , $url, $label) = $match;
-      // Ensure all URLs are absolute.
-      $urls[] = strpos($url, '://') ? $url : preg_replace($regexp, $base_url . '/', $url);
-      return $label . ' [' . count($urls) . ']';
-    }
-  }
-  return $urls;
+  $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
+  // Use soft-breaks only where there is no indent.
+  $break = ($values['length'] ? '' : ' ') . $eol;
+  $line = wordwrap($line, $values['max'] - $values['length'] - strlen($break), $break, TRUE);
 }
 
 /**
  * Helper function for drupal_wrap_mail() and drupal_html_to_text().
  *
- * Replace all non-quotation markers from a given piece of indentation with spaces.
+ * Replace all non-quotation markers from a given piece of indentation with
+ * non-breaking space characters.
  */
 function _drupal_html_to_text_clean($indent) {
-  return preg_replace('/[^>]/', ' ', $indent);
-}
-
-/**
- * Helper function for drupal_html_to_text().
- *
- * Pad the last line with the given character.
- */
-function _drupal_html_to_text_pad($text, $pad, $prefix = '') {
-  // Remove last line break.
-  $text = substr($text, 0, -1);
-  // Calculate needed padding space and add it.
-  if (($p = strrpos($text, "\n")) === FALSE) {
-    $p = -1;
-  }
-  $n = max(0, 79 - (strlen($text) - $p) - strlen($prefix));
-  // Add prefix and padding, and restore linebreak.
-  return $text . $prefix . str_repeat($pad, $n) . "\n";
+  // chr(160) is the non-breaking space character.
+  return preg_replace('/[^>]/', chr(160), $indent);
 }
diff --git a/modules/simpletest/tests/mail.test b/modules/simpletest/tests/mail.test
index 8a7b152d9d32eee7ae47c9ef8b5fb9c77f4e0cf1..571d107c2b7b2f95bfa71fc874fa40571a3724ff 100644
--- a/modules/simpletest/tests/mail.test
+++ b/modules/simpletest/tests/mail.test
@@ -1,6 +1,7 @@
 <?php
 
 /**
+ * @file
  * Test the Drupal mailing system.
  */
 class MailTestCase extends DrupalWebTestCase implements MailSystemInterface {
@@ -63,3 +64,373 @@ class MailTestCase extends DrupalWebTestCase implements MailSystemInterface {
   }
 }
 
+/**
+ * Unit tests for drupal_html_to_text().
+ */
+class DrupalHtmlToTextTestCase extends DrupalUnitTestCase {
+  public static function getInfo() {
+    return array(
+      'name'  => 'HTML to text conversion',
+      'description' => 'Tests drupal_html_to_text().',
+      'group' => 'Mail',
+    );
+  }
+
+  /**
+   * Converts a string to its PHP source equivalent for display in test messages.
+   *
+   * @param $text
+   *   The text string to convert.
+   *
+   * @return
+   *   An HTML representation of the text string that, when displayed in a
+   *   browser, represents the PHP source code equivalent of $text.
+   */
+  function stringToHtml($text) {
+    return '"' .
+      str_replace(
+        array("\n", ' '),
+        array('\n', '&nbsp;'),
+        check_plain($text)
+      ) . '"';
+  }
+
+  /**
+   * Helper function for testing drupal_html_to_text().
+   *
+   * @param $html
+   *   The source HTML string to be converted.
+   * @param $text
+   *   The expected result of converting $html to text.
+   * @param $message
+   *   A text message to display in the assertion message.
+   * @param $allowed_tags
+   *   (optional) An array of allowed tags, or NULL to default to the full
+   *   set of tags supported by drupal_html_to_text().
+   */
+  function assertHtmlToText($html, $text, $message, $allowed_tags = NULL) {
+    $result = drupal_html_to_text($html, $allowed_tags);
+    $pass = $this->assertEqual($result, $text, check_plain($message));
+    if (!$pass) {
+      $this->verbose('html = <pre>' . $this->stringToHtml($html)
+        . '</pre><br />' . 'result = <pre>' . $this->stringToHtml($result)
+        . '</pre><br />' . 'expected = <pre>' . $this->stringToHtml($text)
+        . '</pre>');
+    }
+  }
+
+  /**
+   * Test all supported tags of drupal_html_to_text().
+   */
+  function testTags() {
+    $tests = array(
+      '<a href = "http://drupal.org">Drupal.org</a>' => "Drupal.org [1]\n\n[1] http://drupal.org",
+      '<a href = "/">Homepage</a>' => "Homepage [1]\n\n[1] " . url('', array('absolute' => TRUE)),
+      '<address>Drupal</address>' => "Drupal",
+      '<address>Drupal</address><address>Drupal</address>' => "Drupal\n\nDrupal",
+      '<b>Drupal</b>' => "*Drupal*",
+      '<blockquote>Drupal</blockquote>' => " > Drupal",
+      '<blockquote>Drupal</blockquote><blockquote>Drupal</blockquote>' => " > Drupal\n\n > Drupal",
+      '<br />Drupal<br />Drupal<br /><br />Drupal' => "Drupal\nDrupal\n\nDrupal",
+      '<br/>Drupal<br/>Drupal<br/><br/>Drupal' => "Drupal\nDrupal\n\nDrupal",
+      '<br/>Drupal<br/>Drupal<br/><br/>Drupal<p>Drupal</p>' => "Drupal\nDrupal\n\nDrupal\nDrupal",
+      '<div>Drupal</div>' => "Drupal",
+      '<div>Drupal</div><div>Drupal</div>' => "Drupal\n\nDrupal",
+      '<em>Drupal</em>' => "/Drupal/",
+      '<h1>Drupal</h1>' => "======== DRUPAL " . str_repeat('=', 61),
+      '<h1>Drupal</h1><p>Drupal</p>' => "======== DRUPAL " . str_repeat('=', 61) . "\n\nDrupal",
+      '<h2>Drupal</h2>' => "------ DRUPAL " . str_repeat('-', 63),
+      '<h2>Drupal</h2><p>Drupal</p>' => "------ DRUPAL " . str_repeat('-', 63) . "\n\nDrupal",
+      '<h3>Drupal</h3>' => ".... Drupal",
+      '<h3>Drupal</h3><p>Drupal</p>' => ".... Drupal\n\nDrupal",
+      '<h4>Drupal</h4>' => "... Drupal",
+      '<h4>Drupal</h4><p>Drupal</p>' => "... Drupal\n\nDrupal",
+      '<h5>Drupal</h5>' => ".. Drupal",
+      '<h5>Drupal</h5><p>Drupal</p>' => ".. Drupal\n\nDrupal",
+      '<h6>Drupal</h6>' => ". Drupal",
+      '<h6>Drupal</h6><p>Drupal</p>' => ". Drupal\n\nDrupal",
+      '<hr />Drupal<hr />' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78),
+      '<hr/>Drupal<hr/>' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78),
+      '<hr/>Drupal<hr/><p>Drupal</p>' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78) . "\n\nDrupal",
+      '<i>Drupal</i>' => "/Drupal/",
+      '<p>Drupal</p>' => "Drupal",
+      '<p>Drupal</p><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<pre>Drupal</pre>' => "Drupal",
+      '<pre>Drupal</pre>Drupal' => "Drupal\nDrupal",
+      '<pre>Drupal</pre><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<strong>Drupal</strong>' => "*Drupal*",
+      '<table><tr><td>Drupal</td><td>Drupal</td></tr><tr><td>Drupal</td><td>Drupal</td></tr></table>' => "Drupal  Drupal\nDrupal  Drupal",
+      '<table><tr><td>Drupal</td></tr></table><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<u>Drupal</u>' => "_Drupal_",
+      '<ul><li>Drupal</li></ul>' => "  * Drupal",
+      '<ul><li>Drupal <em>Drupal</em> Drupal</li></ul>' => "  * Drupal /Drupal/ Drupal",
+      '<ul><li>Drupal</li><li><ol><li>Drupal</li><li>Drupal</li></ol></li></ul>' => "  * Drupal\n  *   1) Drupal\n      2) Drupal",
+      '<ul><li>Drupal</li><li><ol><li>Drupal</li></ol></li><li>Drupal</li></ul>' => "  * Drupal\n  *   1) Drupal\n  * Drupal",
+      '<ul><li>Drupal</li><li>Drupal</li></ul>' => "  * Drupal\n  * Drupal",
+      '<ul><li>Drupal</li></ul><p>Drupal</p>' => "  * Drupal\n\nDrupal",
+      '<ol><li>Drupal</li></ol>' => "   1) Drupal",
+      '<ol><li>Drupal</li><li><ul><li>Drupal</li><li>Drupal</li></ul></li></ol>' => "   1) Drupal\n   2)  * Drupal\n       * Drupal",
+      '<ol><li>Drupal</li><li>Drupal</li></ol>' => "   1) Drupal\n   2) Drupal",
+      '<ol>Drupal</ol>' => "Drupal",
+      '<ol><li>Drupal</li></ol><p>Drupal</p>' => "   1) Drupal\n\nDrupal",
+      '<dl><dt>Drupal</dt></dl>' => "Drupal",
+      '<dl><dt>Drupal</dt><dd>Drupal</dd></dl>' => "Drupal\n     Drupal",
+      '<dl><dt>Drupal</dt><dd>Drupal</dd><dt>Drupal</dt><dd>Drupal</dd></dl>' => "Drupal\n     Drupal\nDrupal\n     Drupal",
+      '<dl><dt>Drupal</dt><dd>Drupal</dd></dl><p>Drupal</p>' => "Drupal\n     Drupal\n\nDrupal",
+      '<dl><dt>Drupal<dd>Drupal</dl>' => "Drupal\n     Drupal",
+      '<dl><dt>Drupal</dt></dl><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<ul><li>Drupal</li><li><dl><dt>Drupal</dt><dd>Drupal</dd><dt>Drupal</dt><dd>Drupal</dd></dl></li><li>Drupal</li></ul>' => "  * Drupal\n  * Drupal\n        Drupal\n    Drupal\n        Drupal\n  * Drupal",
+      // Tests malformed HTML tags.
+      '<br>Drupal<br>Drupal' => "Drupal\nDrupal",
+      '<hr>Drupal<hr>Drupal' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78) . "\nDrupal",
+      '<ol><li>Drupal<li>Drupal</ol>' => "   1) Drupal\n   2) Drupal",
+      '<ul><li>Drupal <em>Drupal</em> Drupal</ul></ul>' => "  * Drupal /Drupal/ Drupal",
+      '<ul><li>Drupal<li>Drupal</ol>' => "  * Drupal\n  * Drupal",
+      '<ul><li>Drupal<li>Drupal</ul>' => "  * Drupal\n  * Drupal",
+      '<ul>Drupal</ul>' => "Drupal",
+      'Drupal</ul></ol></dl><li>Drupal' => "Drupal * Drupal",
+      '<dl>Drupal</dl>' => "Drupal",
+      '<dl>Drupal</dl><p>Drupal</p>' => "Drupal\n\nDrupal",
+      '<dt>Drupal</dt>' => "Drupal",
+      // Tests some unsupported HTML tags.
+      '<html>Drupal</html>' => "Drupal",
+      '<script type="text/javascript">Drupal</script>' => "",
+    );
+
+    foreach ($tests as $html => $text) {
+      $this->assertHtmlToText($html, $text, 'All supported tags work as expected.');
+    }
+  }
+
+  /**
+   * Test $allowed_tags argument of drupal_html_to_text().
+   */
+  function testDrupalHtmlToTextArgs() {
+    // The second parameter of drupal_html_to_text() overrules the allowed tags.
+    $this->assertHtmlToText(
+      'Drupal <b>Drupal</b> Drupal',
+      'Drupal *Drupal* Drupal',
+      'Allowed <b> tag found.',
+      array('b')
+    );
+    $this->assertHtmlToText(
+      'Drupal <h1>Drupal</h1> Drupal',
+      'Drupal Drupal Drupal',
+      'Disallowed <h1> tag not found.',
+      array('b')
+    );
+
+    $this->assertHtmlToText(
+      'Drupal <p><em><b>Drupal</b></em><p> Drupal',
+      'Drupal Drupal Drupal',
+      'Disallowed <p>, <em>, and <b> tags not found.',
+      array('a', 'br', 'h1')
+    );
+
+    $this->assertHtmlToText(
+      '<html><body>Drupal</body></html>',
+      'Drupal',
+      'Unsupported <html> and <body> tags not found.',
+      array('html', 'body')
+    );
+  }
+
+  /**
+   * Test that whitespace is collapsed, except within <pre> tags.
+   */
+  function testDrupalHtmltoTextCollapsesWhitespace() {
+    $input = "<pre>Drupal  Drupal\n\nDrupal<pre>Drupal  Drupal\n\nDrupal</pre>Drupal  Drupal\n\nDrupal</pre>";
+    $collapsed = "Drupal Drupal DrupalDrupal Drupal DrupalDrupal Drupal Drupal";
+    $preserved = "Drupal  Drupal\n\nDrupal\nDrupal  Drupal\n\nDrupal\nDrupal  Drupal\n\nDrupal";
+    $this->assertHtmlToText(
+      $input,
+      $collapsed,
+      'Whitespace inside disallowed <pre> tags is collapsed:<br />',
+      array('p')
+    );
+    $this->assertHtmlToText(
+      $input,
+      $preserved,
+      'Whitespace inside allowed <pre> tags is preserved:<br />'
+    );
+  }
+
+  /**
+   * Test that text separated by block-level tags in HTML gets separated by
+   * (at least) a newline in the plaintext version.
+   */
+  function testDrupalHtmlToTextBlockTagToNewline() {
+    $input = '[text]'
+      . '<address>[address]</address>'
+      . '<blockquote>[blockquote]</blockquote>'
+      . '<br />[br]'
+      . '<div>[div]</div>'
+      . '<dl><dt>[dl-dt]</dt>'
+      . '<dt>[dt]</dt>'
+      . '<dd>[dd]</dd>'
+      . '<dd>[dd-dl]</dd></dl>'
+      . '<h1>[h1]</h1>'
+      . '<h2>[h2]</h2>'
+      . '<h3>[h3]</h3>'
+      . '<h4>[h4]</h4>'
+      . '<h5>[h5]</h5>'
+      . '<h6>[h6]</h6>'
+      . '<hr />[hr]'
+      . '<ol><li>[ol-li]</li>'
+      . '<li>[li]</li>'
+      . '<li>[li-ol]</li></ol>'
+      . '<p>[p]</p>'
+      . '<pre>[pre]</pre>'
+      . '<table><thead><tr><td>[table-thead--tr-td]</td></tr></thead>'
+      . '<tbody><tr><td>[tbody-tr-td]</td></tr>'
+      . '<tr><td>[tr-td]</td></tr></tbody></table>'
+      . '<ul><li>[ul-li]</li>'
+      . '<li>[li-ul]</li></ul>'
+      . '[text]';
+    $output = drupal_html_to_text($input);
+    $pass = $this->assertFalse(
+      preg_match('/\][^\n]*\[/s', $output),
+      'Block-level HTML tags should force newlines'
+    );
+    if (!$pass) {
+      $this->verbose($this->stringToHtml($output));
+    }
+    $output_upper = drupal_strtoupper($output);
+    $upper_input = drupal_strtoupper($input);
+    $upper_output = drupal_html_to_text($upper_input);
+    $pass = $this->assertEqual(
+      $upper_output,
+      $output_upper,
+      'Tag recognition should be case-insensitive'
+    );
+    if (!$pass) {
+      $this->verbose(
+        $upper_output
+        . '<br />should  be equal to <br />'
+        . $output_upper
+      );
+    }
+  }
+
+  /**
+   * Test that headers are properly separated from surrounding text.
+   */
+  function testHeaderSeparation() {
+    $html = 'Drupal<h1>Drupal</h1>Drupal';
+    $text = "Drupal\n\n======== DRUPAL " . str_repeat('=', 61) . "\nDrupal";
+    $this->assertHtmlToText($html, $text,
+      'Text before and after <h1> tag is separated as expected.');
+    $html = '<p>Drupal</p><h1>Drupal</h1>Drupal';
+    $text = "Drupal\n\n\n======== DRUPAL " . str_repeat('=', 61) . "\nDrupal";
+    $this->assertHtmlToText($html, $text,
+      'Paragraph before and text after <h1> tag is separated as expected.');
+    $html = 'Drupal<h1>Drupal</h1><p>Drupal</p>';
+    $text = "Drupal\n\n======== DRUPAL " . str_repeat('=', 61) . "\n\nDrupal";
+    $this->assertHtmlToText($html, $text,
+      'Text before and paragraph after <h1> tag is separated as expected.');
+    $html = '<p>Drupal</p><h1>Drupal</h1><p>Drupal</p>';
+    $text = "Drupal\n\n\n======== DRUPAL " . str_repeat('=', 61) . "\n\nDrupal";
+    $this->assertHtmlToText($html, $text,
+      'Paragraph before and after <h1> tag is separated as expected.');
+  }
+
+  /**
+   * Test that footnote references are properly generated.
+   */
+  function testFootnoteReferences() {
+    $source = '<a href="http://www.example.com/node/1">Host and path</a>'
+      . '<br /><a href="http://www.example.com">Host, no path</a>'
+      . '<br /><a href="/node/1">Path, no host</a>'
+      . '<br /><a href="node/1">Relative path</a>';
+    $tt = "Host and path [1]"
+      . "\nHost, no path [2]"
+      . "\nPath, no host [3]"
+      . "\nRelative path [3]"
+      . "\n"
+      . "\n[1] http://www.example.com/node/1"
+      . "\n[2] http://www.example.com"
+      . "\n[3] " . url('node/1', array('absolute' => TRUE));
+    $this->assertHtmlToText($source, $tt, 'Footnotes are as expected.');
+  }
+
+  /**
+   * Test that combinations of paragraph breaks, line breaks, linefeeds,
+   * and spaces are properly handled.
+   */
+  function testDrupalHtmlToTextParagraphs() {
+    $tests = array();
+    $tests[] = array(
+        'html' => "<p>line 1<br />\nline 2<br />line 3\n<br />line 4</p><p>paragraph</p>",
+        'text' => "line 1\nline 2\nline 3\nline 4\n\nparagraph",
+    );
+    $tests[] = array(
+      'html' => "<p>line 1<br /> line 2</p> <p>line 4<br /> line 5</p> <p>0</p>",
+      'text' => "line 1\nline 2\n\nline 4\nline 5\n\n0",
+    );
+    foreach ($tests as $test) {
+      $this->assertHtmlToText($test['html'], $test['text'], 'Paragraphs break as expected.');
+    }
+  }
+
+  /**
+   * Tests that drupal_html_to_text() wraps before 1000 characters.
+   *
+   * RFC 3676 says, "The Text/Plain media type is the lowest common
+   * denominator of Internet email, with lines of no more than 998 characters."
+   *
+   * RFC 2046 says, "SMTP [RFC-821] allows a maximum of 998 octets before the
+   * next CRLF sequence."
+   *
+   * RFC 821 says, "The maximum total length of a text line including the
+   * <CRLF> is 1000 characters."
+   */
+  function testVeryLongLineWrap() {
+    $input = 'Drupal<br /><pre>' . str_repeat('x', 2100) . '</pre><br />Drupal';
+    $output = drupal_html_to_text($input);
+    // This awkward construct comes from includes/mail.inc lines 8-13.
+    $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
+    // The limit is in octets, so measure the line ending with strlen()
+    // rather than the character-counting drupal_strlen().
+    $line_length_limit = 1000 - strlen($eol);
+    $maximum_line_length = 0;
+    foreach (explode($eol, $output) as $line) {
+      // Likewise measure each line in octets: a multibyte character must
+      // count as several octets toward the limit.
+      $maximum_line_length = max($maximum_line_length, strlen($line));
+    }
+    if (!$this->assertFalse($maximum_line_length > $line_length_limit, 'Mail lines are wrapped before 1000 octets.')) {
+      $this->verbose('Maximum line length found was ' . $maximum_line_length . ' octets.');
+    }
+  }
+
+  /**
+   * Ensure that content within <pre> tags is not changed.
+   */
+  function testNoWrapWithinPre() {
+    // This awkward construct comes from includes/mail.inc lines 8-13.
+    $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
+    $html = '<pre>'
+      . str_repeat('a', 30) . ' ' . str_repeat('a', 30) . ' ' // Single space.
+      . str_repeat('b', 30) . ' ' . str_repeat('b', 30) . '  ' // Two spaces.
+      . str_repeat('c', 30) . ' ' . str_repeat('c', 30) . "$eol" // Single newline.
+      . str_repeat('d', 30) . ' ' . str_repeat('d', 30) . "$eol$eol" // Double newline.
+      . str_repeat('e', 30) . ' ' . str_repeat('e', 30) . "$eol " // Newline and space.
+      . str_repeat('f', 30) . ' ' . str_repeat('f', 30) . "$eol  " // Newline and two spaces.
+      . str_repeat('g', 30) . ' ' . str_repeat('g', 30) . " $eol" // Space and newline.
+      . str_repeat('h', 30) . ' ' . str_repeat('h', 30) . "  $eol" // Two spaces and newline.
+      . str_repeat('i', 30) . ' ' . str_repeat('i', 30) . '</pre>';
+    $text = ''
+      . str_repeat('a', 30) . ' ' . str_repeat('a', 30) . "  $eol" // Two spaces and newline.
+      . str_repeat('b', 30) . ' ' . str_repeat('b', 30) . "   $eol" // Three spaces and newline.
+      . str_repeat('c', 30) . ' ' . str_repeat('c', 30) . "$eol" // Single newline.
+      . str_repeat('d', 30) . ' ' . str_repeat('d', 30) . "$eol$eol" // Double newline.
+      . str_repeat('e', 30) . ' ' . str_repeat('e', 30) . "$eol  " // Newline and two spaces.
+      . str_repeat('f', 30) . ' ' . str_repeat('f', 30) . "$eol   " // Newline and three spaces.
+      . str_repeat('g', 30) . ' ' . str_repeat('g', 30) . "$eol" // Newline only.
+      . str_repeat('h', 30) . ' ' . str_repeat('h', 30) . "$eol" // Newline only.
+      . str_repeat('i', 30) . ' ' . str_repeat('i', 30) . '</pre>';
+    // Soft wrapping and space stuffing for lines between about 78 and 998
+    // characters long, according to http://www.ietf.org/rfc/rfc3676.txt.
+    $this->assertHtmlToText($html, $text, 'Text within <pre> treated according to RFC 3676.');
+  }
+}
-- 
1.7.4.1

