From e3d8e1382a403926ff3f79d32bd246ee933243a4 Mon Sep 17 00:00:00 2001
From: Bob Vincent <bobvin@pillars.net>
Date: Sat, 23 Apr 2011 14:44:21 -0400
Subject: [PATCH 1/2] Issue #299138 by catch, Kevin Hankens, drewish, arjenk, jrglasgow, stella, sun, kscheirer, lilou, pillarsdotnet: Fix the broken formatting in drupal_html_to_text() and also add tests. (test-only patch)

---
 modules/simpletest/tests/mail.test |  163 +++++++++++++++++++++++++++++++++++-
 1 files changed, 162 insertions(+), 1 deletions(-)

diff --git a/modules/simpletest/tests/mail.test b/modules/simpletest/tests/mail.test
index 8a7b152d9d32eee7ae47c9ef8b5fb9c77f4e0cf1..458636adaa9919d95e3191ca470eb822a6c8fb5a 100644
--- a/modules/simpletest/tests/mail.test
+++ b/modules/simpletest/tests/mail.test
@@ -1,6 +1,7 @@
 <?php
 
 /**
+ * @file
  * Test the Drupal mailing system.
  */
 class MailTestCase extends DrupalWebTestCase implements MailSystemInterface {
@@ -43,7 +44,7 @@ class MailTestCase extends DrupalWebTestCase implements MailSystemInterface {
   /**
    * Concatenate and wrap the e-mail body for plain-text mails.
    *
-   * @see DefaultMailSystem
+   * @see DefaultMailSystem::format()
    */
   public function format(array $message) {
     // Join the body array into one string.
@@ -63,3 +64,163 @@ class MailTestCase extends DrupalWebTestCase implements MailSystemInterface {
   }
 }
 
+/**
+ * Unit tests for drupal_html_to_text().
+ */
+class DrupalHtmlToTextTestCase extends DrupalUnitTestCase {
+  public static function getInfo() {
+    return array(
+      'name'  => 'HTML to text conversion',
+      'description' => 'Tests drupal_html_to_text().',
+      'group' => 'Mail',
+    );
+  }
+
+  /**
+   * Test all supported tags of drupal_html_to_text().
+   */
+  function testTags() {
+    $tests = array(
+      '<a href = "http://drupal.org">Drupal.org</a>' => "Drupal.org [1]\n[1] http://drupal.org\n",
+      '<address>Drupal</address>' => "Drupal\n",
+      '<b>Drupal</b>' => "*Drupal*",
+      '<blockquote>Drupal</blockquote>' => "> Drupal",
+      '<br />Drupal<br />Drupal' => "Drupal\nDrupal",
+      '<del>Drupal</del>' => "Drupal\n",
+      '<div>Drupal</div>' => "Drupal\n",
+      '<dl><dt>Drupal</dl>' => "Drupal\n",
+      '<dl><dt>Drupal</dt><dd>Drupal</dd></dl>' => "Drupal\n    Drupal\n",
+      '<dl><dt>Drupal<dd>Drupal</dl>' => "Drupal\n    Drupal\n",
+      '<dl>Drupal</dl>' => "Drupal\n",
+      '<dt>Drupal</dt>' => "Drupal",
+      '<em>Drupal</em>' => "/Drupal/",
+      '<h1>Drupal</h1>' => "======== DRUPAL " . str_repeat('=', 62) . "\n",
+      '<h2>Drupal</h2>' => "-------- DRUPAL " . str_repeat('-', 62) . "\n",
+      '<h3>Drupal</h3>' => ".... Drupal\n",
+      '<h4>Drupal</h4>' => ".. Drupal\n",
+      '<h5>Drupal</h5>' => "Drupal\n",
+      '<h6>Drupal</h6>' => "Drupal\n",
+      '<hr />Drupal<hr />' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78),
+      '<ins>Drupal</ins>' => "Drupal\n",
+      '<i>Drupal</i>' => "/Drupal/",
+      '<ol><li>Drupal</li></ol>' => " 1) Drupal\n",
+      '<ol><li>Drupal</li><li><ul><li>Drupal</li><li>Drupal</li></ul></li></ol>' => " 1) Drupal\n 2)  * Drupal\n     * Drupal\n",
+      '<ol><li>Drupal</li><li>Drupal</li></ol>' => " 1) Drupal\n 2) Drupal\n",
+      '<ol>Drupal</ol>' => "Drupal\n",
+      '<p>Drupal</p>' => "Drupal\n",
+      '<pre>Drupal</pre>' => "Drupal\n",
+      '<strong>Drupal</strong>' => "*Drupal*",
+      '<table><tr><td>Drupal</td><td>Drupal</td></tr><tr><td>Drupal</td><td>Drupal</td></tr></table>' => "Drupal Drupal\nDrupal Drupal\n",
+      '<ul><li>Drupal</li></ul>' => " * Drupal\n",
+      '<ul><li>Drupal <em>Drupal</em> Drupal</li></ul>' => " * Drupal /Drupal/ Drupal\n",
+      '<ul><li>Drupal</li><li><ol><li>Drupal</li><li>Drupal</li></ol></li></ul>' => " * Drupal\n *  1) Drupal\n    2) Drupal\n",
+      '<ul><li>Drupal</li><li>Drupal</li></ul>' => " * Drupal\n * Drupal\n",
+      // Tests malformed HTML tags.
+      '<br>Drupal<br>Drupal' => "Drupal\nDrupal",
+      '<hr>Drupal<hr>Drupal' => str_repeat('-', 78) . "\nDrupal\n" . str_repeat('-', 78) . "\nDrupal",
+      '<ol><li>Drupal<li>Drupal</ol>' => " 1) Drupal\n 2) Drupal\n",
+      '<ul><li>Drupal <em>Drupal</em> Drupal</ul></ul>' => " * Drupal /Drupal/ Drupal\n",
+      '<ul><li>Drupal<li>Drupal</ol>' => " * Drupal\n * Drupal\n",
+      '<ul><li>Drupal<li>Drupal</ul>' => " * Drupal\n * Drupal\n",
+      '<ul>Drupal</ul>' => "Drupal\n",
+      'Drupal</ul></ol></dl><li>Drupal' => "Drupal\n * Drupal",
+      // Tests some unsupported HTML tags.
+      '<html>Drupal</html>' => "Drupal",
+      '<script type="text/javascript">Drupal</script>' => "Drupal",
+    );
+
+    foreach ($tests as $html => $text) {
+      $result = drupal_html_to_text($html);
+      $this->assertEqual($result, $text,
+        var_export($html, TRUE)
+        . '<br />'
+        . str_replace("\n", '\n', check_plain(var_export($result, TRUE)))
+        . '<br />is equal to<br />'
+        . str_replace("\n", '\n', check_plain(var_export($text, TRUE)))
+      );
+    }
+  }
+
+  /**
+   * Test $allowed_tags argument of drupal_html_to_text().
+   */
+  function testDrupalHtmlToTextArgs() {
+    // The second parameter of drupal_html_to_text() overrules the allowed tags.
+    $result = drupal_html_to_text('Drupal <b>Drupal</b> Drupal', array('b'));
+    $this->assertEqual($result, 'Drupal *Drupal* Drupal', 'Allowed &lt;b&gt; tag found.');
+
+    $result = drupal_html_to_text('Drupal <h1>Drupal</h1> Drupal', array('b'));
+    $this->assertEqual($result, 'Drupal Drupal Drupal', 'Disallowed &lt;h1&gt; tag not found.');
+
+    $result = drupal_html_to_text('Drupal <p><em><b>Drupal</b></em><p> Drupal', array('a', 'br', 'h1'));
+    $this->assertEqual($result, 'Drupal Drupal Drupal', 'Disallowed &lt;p&gt;, &lt;em&gt;, and &lt;b&gt; tags not found.');
+
+    $result = drupal_html_to_text('<html><body>Drupal</body></html>', array('html', 'body'));
+    $this->assertEqual($result, 'Drupal', 'Unsupported &lt;html&gt; and &lt;body&gt; tags not found.');
+  }
+
+  /**
+   * Test that internal whitespace in plaintext input is preserved.
+   */
+  function testDrupalHtmltoTextPreservesWhitespace() {
+    $input = "\n \n  \nDrupal\n Drupal\n  Drupal\n \n  \n";
+    $expected = "Drupal\n Drupal\n  Drupal";
+    $result = drupal_html_to_text($input, NULL);
+    $this->assertEqual($result, $expected,
+      'Internal Whitespace is preserved:<br />'
+      . "<pre>$input</pre><br />becomes<br /><pre>$result</pre>"
+    );
+  }
+
+  /**
+   * Test that text separated by block-level tags in HTML gets separated by
+   * (at least) a newline in the plaintext version.
+   */
+  function testDrupalHtmlToTextBlockTagToNewline() {
+    $input = '[text]'
+      . '<address>[address]</address>'
+      . '<blockquote>[blockquote]</blockquote>'
+      . '<br />[br]'
+      . '<del>[del]</del>'
+      . '<div>[div]</div>'
+      . '<dl><dt>[dl-dt]</dt>'
+      . '<dt>[dt]</dt>'
+      . '<dd>[dd]</dd>'
+      . '<dd>[dd]</dd></dl>'
+      . '<h1>[h1]</h1>'
+      . '<h2>[h2]</h2>'
+      . '<h3>[h3]</h3>'
+      . '<h4>[h4]</h4>'
+      . '<h5>[h5]</h5>'
+      . '<h6>[h6]</h6>'
+      . '<hr />[hr]'
+      . '<ins>[ins]</ins>'
+      . '<ol><li>[ol-li]</li>'
+      . '<li>[li]</li></ol>'
+      . '<p>[p]</p>'
+      . '<pre>[pre]</pre>'
+      . '<table><thead><tr><td>[table-thead--tr-td]</td></tr></thead>'
+      . '<tbody><tr><td>[tbody-tr-td]</td></tr>'
+      . '<tr><td>[tr-td]</td></tr></tbody></table>'
+      . '<ul><li>[ul-li]</li>'
+      . '<li>[li]</li></ul>'
+      . '[text]';
+    $output = drupal_html_to_text($input);
+    $this->assertFalse(
+      preg_match('/\][^\n]*\[/s', $output),
+      'Block-level HTML tags should force newlines: '
+      . nl2br(check_plain($output))
+    );
+    $output_upper = drupal_strtoupper($output);
+    $upper_input = drupal_strtoupper($input);
+    $upper_output = drupal_html_to_text($upper_input);
+    $this->assertEqual(
+      $upper_output,
+      $output_upper,
+      'Tag recognition should be case-insensitive:<br />'
+      . $upper_output
+      . '<br />should  be equal to <br />'
+      . $output_upper
+    );
+  }
+}
-- 
1.7.1


From fd354aad1259d8f6637d6dab93e842921095b1b9 Mon Sep 17 00:00:00 2001
From: Bob Vincent <bobvin@pillars.net>
Date: Sat, 23 Apr 2011 15:19:50 -0400
Subject: [PATCH 2/2] Issue #299138 by catch, Kevin Hankens, drewish, arjenk, jrglasgow, stella, sun, kscheirer, lilou, pillarsdotnet: Fix the broken formatting in drupal_html_to_text() and also add tests. (tests+fix patch)

---
 includes/mail.inc |  128 +++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 90 insertions(+), 38 deletions(-)

diff --git a/includes/mail.inc b/includes/mail.inc
index d2febed39686c9bf3f6f7a2bf99fa1377d09f4de..a6f7fc464f8e9118c015263fde8f01a7a1324627 100644
--- a/includes/mail.inc
+++ b/includes/mail.inc
@@ -267,7 +267,7 @@ interface MailSystemInterface {
    * @return
    *   The formatted $message.
    */
-   public function format(array $message);
+  public function format(array $message);
 
   /**
    * Send a message composed by drupal_mail().
@@ -294,7 +294,7 @@ interface MailSystemInterface {
    * @return
    *   TRUE if the mail was successfully accepted for delivery, otherwise FALSE.
    */
-   public function mail(array $message);
+  public function mail(array $message);
 }
 
 /**
@@ -303,39 +303,41 @@ interface MailSystemInterface {
  * We use delsp=yes wrapping, but only break non-spaced languages when
  * absolutely necessary to avoid compatibility issues.
  *
- * We deliberately use LF rather than CRLF, see drupal_mail().
+ * We deliberately use MAIL_LINE_ENDINGS rather than CRLF.
  *
  * @param $text
  *   The plain text to process.
  * @param $indent (optional)
  *   A string to indent the text with. Only '>' characters are repeated on
  *   subsequent wrapped lines. Others are replaced by spaces.
+ *
+ * @see drupal_mail()
  */
 function drupal_wrap_mail($text, $indent = '') {
-  // Convert CRLF into LF.
-  $text = str_replace("\r", '', $text);
+  // Normalize line breaks (CRLF or bare LF) to MAIL_LINE_ENDINGS.
+  $text = preg_replace('/\r?\n/', MAIL_LINE_ENDINGS, $text);
   // See if soft-wrapping is allowed.
   $clean_indent = _drupal_html_to_text_clean($indent);
   $soft = strpos($clean_indent, ' ') === FALSE;
   // Check if the string has line breaks.
-  if (strpos($text, "\n") !== FALSE) {
+  if (strpos($text, MAIL_LINE_ENDINGS) !== FALSE) {
     // Remove trailing spaces to make existing breaks hard.
-    $text = preg_replace('/ +\n/m', "\n", $text);
+    $text = preg_replace('/ +\r?\n/m', MAIL_LINE_ENDINGS, $text);
     // Wrap each line at the needed width.
-    $lines = explode("\n", $text);
-    array_walk($lines, '_drupal_wrap_mail_line', array('soft' => $soft, 'length' => strlen($indent)));
-    $text = implode("\n", $lines);
+    $lines = explode(MAIL_LINE_ENDINGS, $text);
+    array_walk($lines, '_drupal_wrap_mail_line', array('soft' => $soft, 'length' => drupal_strlen($indent)));
+    $text = implode(MAIL_LINE_ENDINGS, $lines);
   }
   else {
     // Wrap this line.
-    _drupal_wrap_mail_line($text, 0, array('soft' => $soft, 'length' => strlen($indent)));
+    _drupal_wrap_mail_line($text, 0, array('soft' => $soft, 'length' => drupal_strlen($indent)));
   }
   // Empty lines with nothing but spaces.
-  $text = preg_replace('/^ +\n/m', "\n", $text);
+  $text = preg_replace('/^ +\r?\n/m', MAIL_LINE_ENDINGS, $text);
   // Space-stuff special lines.
-  $text = preg_replace('/^(>| |From)/m', ' $1', $text);
+  $text = preg_replace('/^(>|From)/m', ' $1', $text);
   // Apply indentation. We only include non-'>' indentation on the first line.
-  $text = $indent . substr(preg_replace('/^/m', $clean_indent, $text), strlen($indent));
+  $text = $indent . drupal_substr(preg_replace('/^/m', $clean_indent, $text), drupal_strlen($indent));
 
   return $text;
 }
@@ -347,11 +349,18 @@ function drupal_wrap_mail($text, $indent = '') {
  * The output will be suitable for use as 'format=flowed; delsp=yes' text
  * (RFC 3676) and can be passed directly to drupal_mail() for sending.
  *
- * We deliberately use LF rather than CRLF, see drupal_mail().
+ * We deliberately use MAIL_LINE_ENDINGS rather than CRLF.
  *
  * This function provides suitable alternatives for the following tags:
- * <a> <em> <i> <strong> <b> <br> <p> <blockquote> <ul> <ol> <li> <dl> <dt>
- * <dd> <h1> <h2> <h3> <h4> <h5> <h6> <hr>
+ *
+ * <a> <address> <b> <blockquote> <br /> <dd> <dl> <dt> <em>
+ * <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <ol> <p> <strong> <ul>
+ *
+ * The following tags are also handled:
+ *
+ * <del> <div> <ins> <pre> <tr>: Rendered the same as a <p> tag.
+ *
+ * <td>: A space is inserted between adjacent table cells.
  *
  * @param $string
  *   The string to be transformed.
@@ -361,12 +370,18 @@ function drupal_wrap_mail($text, $indent = '') {
  *
  * @return
  *   The transformed string.
+ *
+ * @see drupal_mail()
  */
 function drupal_html_to_text($string, $allowed_tags = NULL) {
   // Cache list of supported tags.
   static $supported_tags;
   if (empty($supported_tags)) {
-    $supported_tags = array('a', 'em', 'i', 'strong', 'b', 'br', 'p', 'blockquote', 'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr');
+    $supported_tags = array(
+      'a', 'address', 'b', 'blockquote', 'br', 'dd', 'del', 'div', 'dl', 'dt',
+      'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'ins', 'li', 'ol',
+      'p', 'pre', 'strong', 'td', 'tr', 'ul',
+    );
   }
 
   // Make sure only supported tags are kept.
@@ -379,6 +394,9 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
   $string = preg_replace('!</?(em|i)((?> +)[^>]*)?>!i', '/', $string);
   $string = preg_replace('!</?(strong|b)((?> +)[^>]*)?>!i', '*', $string);
 
+  // Separate adjacent table cells.
+  $string = preg_replace('!(</td>)?<td>!i', ' ', $string);
+
   // Replace inline <a> tags with the text of link and a footnote.
   // 'See <a href="http://drupal.org">the Drupal site</a>' becomes
   // 'See the Drupal site [1]' with the URL included as a footnote.
@@ -388,9 +406,9 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
   $urls = _drupal_html_to_mail_urls();
   $footnotes = '';
   if (count($urls)) {
-    $footnotes .= "\n";
+    $footnotes .= MAIL_LINE_ENDINGS;
     for ($i = 0, $max = count($urls); $i < $max; $i++) {
-      $footnotes .= '[' . ($i + 1) . '] ' . $urls[$i] . "\n";
+      $footnotes .= '[' . ($i + 1) . '] ' . $urls[$i] . MAIL_LINE_ENDINGS;
     }
   }
 
@@ -409,28 +427,50 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
 
     // Process HTML tags (but don't output any literally).
     if ($tag) {
-      list($tagname) = explode(' ', strtolower($value), 2);
+      list($tagname) = explode(' ', drupal_strtolower($value), 2);
       switch ($tagname) {
         // List counters
         case 'ul':
           array_unshift($lists, '*');
           break;
         case 'ol':
-          array_unshift($lists, 1);
+          // Support start attribute.
+          if (preg_match('/\bstart\s*=\s*([\'"]?)([0-9]+)\b/i', $value, $matches)) {
+            array_unshift($lists, $matches[2]);
+          }
+          else {
+            array_unshift($lists, 1);
+          }
           break;
         case '/ul':
         case '/ol':
-          array_shift($lists);
-          $chunk = ''; // Ensure blank new-line.
+          if ($lists) {
+            array_shift($lists);
+            $chunk = ''; // Ensure blank new-line.
+          }
           break;
 
         // Quotation/list markers, non-fancy headers
         case 'blockquote':
           // Format=flowed indentation cannot be mixed with lists.
-          $indent[] = count($lists) ? ' "' : '>';
+          $indent[] = count($lists) ? ' "' : '> ';
           break;
         case 'li':
-          $indent[] = is_numeric($lists[0]) ? ' ' . $lists[0]++ . ') ' : ' * ';
+          // Support value attribute.
+          if (isset($lists[0]) && is_numeric($lists[0])) {
+            $inc = ' ';
+            if (preg_match('/\bvalue\s*=\s*([\'"]?)([0-9]+)\b/i', $value, $matches)) {
+              $inc .= $matches[2];
+              $lists[0] = $matches[2] + 1;
+            }
+            else {
+              $inc .= $lists[0]++;
+            }
+            $indent[] = $inc . ') ';
+          }
+          else {
+            $indent[] = ' * ';
+          }
           break;
         case 'dd':
           $indent[] = '    ';
@@ -444,7 +484,7 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
         case '/blockquote':
           if (count($lists)) {
             // Append closing quote for inline quotes (immediately).
-            $output = rtrim($output, "> \n") . "\"\n";
+            $output = rtrim($output, "> \r\n") . '"' . MAIL_LINE_ENDINGS;
             $chunk = ''; // Ensure blank new-line.
           }
           // Fall-through
@@ -481,14 +521,24 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
         // Horizontal rulers
         case 'hr':
           // Insert immediately.
-          $output .= drupal_wrap_mail('', implode('', $indent)) . "\n";
-          $output = _drupal_html_to_text_pad($output, '-');
+          $output .= drupal_wrap_mail('', implode('', $indent));
+          if ($output) {
+            $output .= MAIL_LINE_ENDINGS;
+          }
+          $output .= str_repeat('-', 78);
           break;
-
-        // Paragraphs and definition lists
+        // Paragraphs and other block-level tags
+        case '/address':
+        case 'br':
+        case '/ins':
+        case '/del':
+        case '/div':
         case '/p':
+        case '/pre':
+        case '/tr':
         case '/dl':
           $chunk = ''; // Ensure blank new-line.
+        default:
           break;
       }
     }
@@ -509,6 +559,9 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
         $chunk = $casing($chunk);
       }
       // Format it and apply the current indentation.
+      if ($output) {
+        $output = rtrim($output) . MAIL_LINE_ENDINGS;
+      }
       $output .= drupal_wrap_mail($chunk, implode('', $indent));
       // Remove non-quotation markers from indentation.
       $indent = array_map('_drupal_html_to_text_clean', $indent);
@@ -516,7 +569,6 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
 
     $tag = !$tag;
   }
-
   return $output . $footnotes;
 }
 
@@ -527,9 +579,9 @@ function drupal_html_to_text($string, $allowed_tags = NULL) {
  */
 function _drupal_wrap_mail_line(&$line, $key, $values) {
   // Use soft-breaks only for purely quoted or unindented text.
-  $line = wordwrap($line, 77 - $values['length'], $values['soft'] ? "  \n" : "\n");
+  $line = wordwrap($line, 77 - $values['length'], ($values['soft'] ? ' ' : '') . MAIL_LINE_ENDINGS);
   // Break really long words at the maximum width allowed.
-  $line = wordwrap($line, 996 - $values['length'], $values['soft'] ? " \n" : "\n");
+  $line = wordwrap($line, 996 - $values['length'], ($values['soft'] ? ' ' : '') . MAIL_LINE_ENDINGS);
 }
 
 /**
@@ -575,12 +627,12 @@ function _drupal_html_to_text_clean($indent) {
  */
 function _drupal_html_to_text_pad($text, $pad, $prefix = '') {
   // Remove last line break.
-  $text = substr($text, 0, -1);
+  $text = preg_replace('/\r?\n$/s', '', $text);
   // Calculate needed padding space and add it.
-  if (($p = strrpos($text, "\n")) === FALSE) {
+  if (($p = strrpos($text, MAIL_LINE_ENDINGS)) === FALSE) {
     $p = -1;
   }
-  $n = max(0, 79 - (strlen($text) - $p) - strlen($prefix));
+  $n = max(0, 78 - drupal_strlen(substr($text, $p < 0 ? 0 : $p + strlen(MAIL_LINE_ENDINGS))) - drupal_strlen($prefix));
   // Add prefix and padding, and restore linebreak.
-  return $text . $prefix . str_repeat($pad, $n) . "\n";
+  return $text . $prefix . str_repeat($pad, $n) . MAIL_LINE_ENDINGS;
 }
-- 
1.7.1

