From 5b8d7e6222a15a443dd04016d9b010c165389fcd Mon Sep 17 00:00:00 2001 From: Bob Vincent Date: Mon, 30 May 2011 05:50:33 -0400 Subject: [PATCH] Issue #299138 by catch, Kevin Hankens, drewish, arjenk, jrglasgow, stella, sun, kscheirer, lilou, pillarsdotnet, stephandale, salvis: Improve drupal_html_to_text(). --- includes/mail.inc | 828 ++++++++++++++++++++++++++---------- modules/simpletest/tests/mail.test | 393 +++++++++++++++++ 2 files changed, 1001 insertions(+), 220 deletions(-) diff --git a/includes/mail.inc b/includes/mail.inc index be2df923427ec363f671132771e9c97ee490c090..69a2fb47313a33e92a9422efafd9d0e97e6dec25 100644 --- a/includes/mail.inc +++ b/includes/mail.inc @@ -55,7 +55,7 @@ define('MAIL_LINE_ENDINGS', isset($_SERVER['WINDIR']) || strpos($_SERVER['SERVER * $data['user'] = $params['account']; * $options['language'] = $message['language']; * user_mail_tokens($variables, $data, $options); - * switch($key) { + * switch ($key) { * case 'notice': * $langcode = $message['language']->language; * $message['subject'] = t('Notification from !site', $variables, array('langcode' => $langcode)); @@ -267,7 +267,7 @@ interface MailSystemInterface { * @return * The formatted $message. */ - public function format(array $message); + public function format(array $message); /** * Send a message composed by drupal_mail(). @@ -294,7 +294,7 @@ interface MailSystemInterface { * @return * TRUE if the mail was successfully accepted for delivery, otherwise FALSE. */ - public function mail(array $message); + public function mail(array $message); } /** @@ -303,41 +303,64 @@ interface MailSystemInterface { * We use delsp=yes wrapping, but only break non-spaced languages when * absolutely necessary to avoid compatibility issues. * - * We deliberately use LF rather than CRLF, see drupal_mail(). + * We deliberately use variable_get('mail_line_endings', MAIL_LINE_ENDINGS) + * rather than "\r\n". * * @param $text * The plain text to process. 
- * @param $indent (optional) - * A string to indent the text with. Only '>' characters are repeated on - * subsequent wrapped lines. Others are replaced by spaces. + * @param array $options + * (optional) An array containing one or more of the following keys: + * - indent: A string to indent the text with. Only '>' characters are + * repeated on subsequent wrapped lines. Others are replaced by spaces. + * - max: The maximum length at which to wrap each line. Defaults to 80. + * - stuff: Whether to space-stuff special lines. Defaults to TRUE. + * - hard: Whether to enforce the maximum line length even if no convenient + * space character is available. Defaults to FALSE. + * - pad: A string to use for padding short lines to 'max' characters. If + * more than one character, only the last will be repeated. + * - break: The line break sequence to insert. The default is one of the + * following: + * - "\r\n": Windows, when $text does not contain a space character. + * - "\n": Non-Windows, when $text does not contain a space character. + * - " \r\n": On Windows, when $text contains at least one space. + * - " \n": Non-Windows, when $text contains at least one space. + * + * @see drupal_mail() */ -function drupal_wrap_mail($text, $indent = '') { - // Convert CRLF into LF. - $text = str_replace("\r", '', $text); - // See if soft-wrapping is allowed. - $clean_indent = _drupal_html_to_text_clean($indent); - $soft = strpos($clean_indent, ' ') === FALSE; - // Check if the string has line breaks. - if (strpos($text, "\n") !== FALSE) { - // Remove trailing spaces to make existing breaks hard. - $text = preg_replace('/ +\n/m', "\n", $text); - // Wrap each line at the needed width. 
- $lines = explode("\n", $text); - array_walk($lines, '_drupal_wrap_mail_line', array('soft' => $soft, 'length' => strlen($indent))); - $text = implode("\n", $lines); +function drupal_wrap_mail($text, array $options = array()) { + static $defaults; + if (!isset($defaults)) { + $defaults = array( + 'indent' => '', + 'pad' => '', + 'pad_repeat' => '', + 'max' => 80, + 'stuff' => TRUE, + 'hard' => FALSE, + 'eol' => variable_get('mail_line_endings', MAIL_LINE_ENDINGS), + ); } - else { - // Wrap this line. - _drupal_wrap_mail_line($text, 0, array('soft' => $soft, 'length' => strlen($indent))); + $options += $defaults; + if (!isset($options['break'])) { + // Allow soft-wrap spaces only when $text contains at least one space. + $options['break'] = (strpos($text, ' ') === FALSE ? '' : ' ') . $defaults['eol']; } - // Empty lines with nothing but spaces. - $text = preg_replace('/^ +\n/m', "\n", $text); - // Space-stuff special lines. - $text = preg_replace('/^(>| |From)/m', ' $1', $text); - // Apply indentation. We only include non-'>' indentation on the first line. - $text = $indent . substr(preg_replace('/^/m', $clean_indent, $text), strlen($indent)); - - return $text; + $options['wrap'] = $options['max'] - drupal_strlen($options['indent'] . $defaults['eol']); + if ($options['pad']) { + $options['pad_repeat'] = drupal_substr($options['pad'], -1, 1); + } + // The 'clean' indent is applied to all lines after the first one. + $options['clean'] = _drupal_html_to_text_clean($options['indent']); + // Replace line breaks with platform-dependent text. + $text = preg_replace('/\r?\n/', $defaults['eol'], $text); + // Wrap lines according to RFC 3676. + $lines = explode($defaults['eol'], $text); + array_walk($lines, '_drupal_wrap_mail_line', $options); + // Expand the lines array on newly-inserted line breaks. + $lines = explode($defaults['eol'], implode($defaults['eol'], $lines)); + // Apply indentation, space-stuffing, and padding. 
+ array_walk($lines, '_drupal_indent_mail_line', $options); + return implode($defaults['eol'], $lines); } /** @@ -347,240 +370,605 @@ function drupal_wrap_mail($text, $indent = '') { * The output will be suitable for use as 'format=flowed; delsp=yes' text * (RFC 3676) and can be passed directly to drupal_mail() for sending. * - * We deliberately use LF rather than CRLF, see drupal_mail(). + * We deliberately use variable_get('mail_line_endings', MAIL_LINE_ENDINGS) + * rather than "\r\n". * * This function provides suitable alternatives for the following tags: - *

' => " * Drupal /Drupal/ Drupal", + '
  • Drupal
  • Drupal' => " * Drupal\n * Drupal", + '
    • Drupal
    • Drupal
    ' => " * Drupal\n * Drupal", + '
      Drupal
    ' => "Drupal", + 'Drupal
  • Drupal' => "Drupal * Drupal", + '
    Drupal
    ' => "Drupal", + '
    Drupal

    Drupal

    ' => "Drupal\n\nDrupal", + '
    Drupal
    ' => "Drupal", + // Tests some unsupported HTML tags. + 'Drupal' => "Drupal", + '' => "", + ); + + foreach ($tests as $html => $text) { + $this->assertHtmlToText($html, $text, 'Supported tags'); + } + } + + /** + * Test $allowed_tags argument of drupal_html_to_text(). + */ + function testDrupalHtmlToTextArgs() { + // The second parameter of drupal_html_to_text() overrules the allowed tags. + $this->assertHtmlToText( + 'Drupal Drupal Drupal', + 'Drupal *Drupal* Drupal', + 'Allowed tag found', + array('b') + ); + $this->assertHtmlToText( + 'Drupal

    Drupal

    Drupal', + 'Drupal Drupal Drupal', + 'Disallowed

    tag not found', + array('b') + ); + + $this->assertHtmlToText( + 'Drupal

    Drupal

    Drupal', + 'Drupal Drupal Drupal', + 'Disallowed

    , , and tags not found', + array('a', 'br', 'h1') + ); + + $this->assertHtmlToText( + 'Drupal', + 'Drupal', + 'Unsupported and tags not found', + array('html', 'body') + ); + } + + /** + * Test that whitespace is collapsed, except within

     tags.
    +   */
    +  function testDrupalHtmltoTextCollapsesWhitespace() {
    +    $input = "
    Drupal  Drupal\n\nDrupal
    Drupal  Drupal\n\nDrupal
    Drupal Drupal\n\nDrupal
    "; + $collapsed = "Drupal Drupal DrupalDrupal Drupal DrupalDrupal Drupal Drupal"; + $preserved = "Drupal Drupal\n\nDrupal\nDrupal Drupal\n\nDrupal\nDrupal Drupal\n\nDrupal"; + $this->assertHtmlToText( + $input, + $collapsed, + 'Whitespace inside disallowed
     tags is collapsed',
    +      array('p')
    +    );
    +    $this->assertHtmlToText(
    +      $input,
    +      $preserved,
    +      'Whitespace inside allowed 
     tags is preserved'
    +    );
    +  }
    +
    +  /**
    +   * Test that text separated by block-level tags in HTML is separated by
    +   * (at least) a newline in the plaintext version.
    +   */
    +  function testDrupalHtmlToTextBlockTagToNewline() {
    +    $input = '[text]'
    +      . '
    [address]
    ' + . '
    [blockquote]
    ' + . '
    [br]' + . '
    [div]
    ' + . '
    [dl-dt]
    ' + . '
    [dt]
    ' + . '
    [dd]
    ' + . '
    [dd-dl]
    ' + . '

    [h1]

    ' + . '

    [h2]

    ' + . '

    [h3]

    ' + . '

    [h4]

    ' + . '
    [h5]
    ' + . '
    [h6]
    ' + . '
    [hr]' + . '
    1. [ol-li]
    2. ' + . '
    3. [li]
    4. ' + . '
    5. [li-ol]
    ' + . '

    [p]

    ' + . '
    [pre]
    ' + . '' + . '' + . '
    [table-thead--tr-td]
    [tbody-tr-td]
    [tr-td]
    ' + . '
    • [ul-li]
    • ' + . '
    • [li-ul]
    ' + . '[text]'; + $output = drupal_html_to_text($input); + $pass = $this->assertFalse( + preg_match('/\][^\n]*\[/s', $output), + 'Block-level HTML tags should force newlines' + ); + if (!$pass) { + $this->verbose($this->stringToHtml($output)); + } + $output_upper = drupal_strtoupper($output); + $upper_input = drupal_strtoupper($input); + $upper_output = drupal_html_to_text($upper_input); + $pass = $this->assertEqual( + $upper_output, + $output_upper, + 'Tag recognition should be case-insensitive' + ); + if (!$pass) { + $this->verbose( + $upper_output + . '
    should be equal to
    ' + . $output_upper + ); + } + } + + /** + * Test that headers are properly separated from surrounding text. + */ + function testHeaderSeparation() { + $html = 'Drupal

    Drupal

    Drupal'; + $text = "Drupal\n\n======== DRUPAL " . str_repeat('=', 64) . "\nDrupal"; + $this->assertHtmlToText($html, $text, + 'Text before and after

    tag'); + $html = '

    Drupal

    Drupal

    Drupal'; + $text = "Drupal\n\n\n======== DRUPAL " . str_repeat('=', 64) . "\nDrupal"; + $this->assertHtmlToText($html, $text, + 'Paragraph before and text after

    tag'); + $html = 'Drupal

    Drupal

    Drupal

    '; + $text = "Drupal\n\n======== DRUPAL " . str_repeat('=', 64) . "\n\nDrupal"; + $this->assertHtmlToText($html, $text, + 'Text before and paragraph after

    tag'); + $html = '

    Drupal

    Drupal

    Drupal

    '; + $text = "Drupal\n\n\n======== DRUPAL " . str_repeat('=', 64) . "\n\nDrupal"; + $this->assertHtmlToText($html, $text, + 'Paragraph before and after

    tag'); + } + + /** + * Test that footnote references are properly generated. + */ + function testFootnoteReferences() { + global $base_url; + $source = 'Host and path' + . '
    Host, no path' + . '
    Path, no host' + . '
    Relative path'; + $tt = "Host and path [1]" + . "\nHost, no path [2]" + . "\nPath, no host [3]" + . "\nRelative path [3]" + . "\n" + . "\n[1] http://www.example.com/node/1" + . "\n[2] http://www.example.com" + . "\n[3] $base_url/node/1"; + $this->assertHtmlToText($source, $tt, 'Footnotes'); + } + + /** + * Test that combinations of paragraph breaks, line breaks, linefeeds, + * and spaces are properly handled. + */ + function testDrupalHtmlToTextParagraphs() { + $tests = array(); + $tests[] = array( + 'html' => "

    line 1
    \nline 2
    line 3\n
    line 4

    paragraph

    ", + 'text' => "line 1\nline 2\nline 3\nline 4\n\nparagraph", + ); + $tests[] = array( + 'html' => "

    line 1
    line 2

    line 4
    line 5

    0

    ", + 'text' => "line 1\nline 2\n\nline 4\nline 5\n\n0", + ); + foreach ($tests as $test) { + $this->assertHtmlToText($test['html'], $test['text'], 'Paragraph breaks'); + } + } + + /** + * Tests that drupal_html_to_text() wraps before 1000 characters. + * + * RFC 3676 says, "The Text/Plain media type is the lowest common + * denominator of Internet email, with lines of no more than 998 characters." + * + * RFC 2046 says, "SMTP [RFC-821] allows a maximum of 998 octets before the + * next CRLF sequence." + * + * RFC 821 says, "The maximum total length of a text line including the + * is 1000 characters." + */ + function testVeryLongLineWrap() { + $input = 'Drupal
    ' . str_repeat('x', 2100) . '

    Drupal'; + $output = drupal_html_to_text($input); + // This awkward construct comes from includes/mail.inc lines 8-13. + $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS); + // We must use strlen() rather than drupal_strlen() in order to count + // octets rather than characters. + $line_length_limit = 1000 - drupal_strlen($eol); + $maximum_line_length = 0; + foreach (explode($eol, $output) as $line) { + // We must use strlen() rather than drupal_strlen() in order to count + // octets rather than characters. + $maximum_line_length = max($maximum_line_length, strlen($line . $eol)); + } + if (!$this->assertFalse($maximum_line_length > 1000, 'Mail lines are wrapped at 1000 octets.')) { + $this->verbose('Maximum line length found was ' . $maximum_line_length . ' octets.'); + } + } + + /** + * Ensure that content within
     tags is not changed.
    +   */
    +  function testNoWrapWithinPre() {
    +    // This awkward construct comes from includes/mail.inc lines 8-13.
    +    $eol = variable_get('mail_line_endings', MAIL_LINE_ENDINGS);
    +    $html = '
    '
    +      // Single space.
    +      . str_repeat('a', 30) . ' ' . str_repeat('a', 30) . ' '
    +      // Two spaces.
    +      . str_repeat('b', 30) . ' ' . str_repeat('b', 30) . '  '
    +      // Single newline.
    +      . str_repeat('c', 30) . ' ' . str_repeat('c', 30) . "$eol"
    +      // Double newline.
    +      . str_repeat('d', 30) . ' ' . str_repeat('d', 30) . "$eol$eol"
    +      // Newline and space.
    +      . str_repeat('e', 30) . ' ' . str_repeat('e', 30) . "$eol "
    +      // Newline and two spaces.
    +      . str_repeat('f', 30) . ' ' . str_repeat('f', 30) . "$eol  "
    +      // Space and newline.
    +      . str_repeat('g', 30) . ' ' . str_repeat('g', 30) . " $eol"
    +      // Two spaces and newline.
    +      . str_repeat('h', 30) . ' ' . str_repeat('h', 30) . "  $eol"
    +      . str_repeat('i', 30) . ' ' . str_repeat('i', 30) . '
    '; + $text = '' + // One space and newline. + . str_repeat('a', 30) . ' ' . str_repeat('a', 30) . " $eol" + // Two spaces and newline. + . str_repeat('b', 30) . ' ' . str_repeat('b', 30) . " $eol" + // Single newline. + . str_repeat('c', 30) . ' ' . str_repeat('c', 30) . "$eol" + // Double newline. + . str_repeat('d', 30) . ' ' . str_repeat('d', 30) . "$eol$eol" + // Newline and two spaces. + . str_repeat('e', 30) . ' ' . str_repeat('e', 30) . "$eol " + // Newline and three spaces. + . str_repeat('f', 30) . ' ' . str_repeat('f', 30) . "$eol " + // Newline only. + . str_repeat('g', 30) . ' ' . str_repeat('g', 30) . "$eol" + // Newline only. + . str_repeat('h', 30) . ' ' . str_repeat('h', 30) . "$eol" + . str_repeat('i', 30) . ' ' . str_repeat('i', 30); + $this->assertHtmlToText($html, $text, 'Soft-wrap and space-stuff text within
     according to RFC-3676');
    +  }
    +}
    -- 
    1.7.4.1