From 52e4140d6d19eb464d092d24bb8bf08cd997ae42 Mon Sep 17 00:00:00 2001
From: Bob Vincent <bobvin@pillars.net>
Date: Tue, 6 Mar 2012 04:07:45 -0500
Subject: [PATCH] Issue #221257 by gpk, AlexisWilke, NancyDru, sun,
 pillarsdotnet, David_Rothstein, effulgentsia:
 text_summary() should output valid HTML and Unicode text.

---
 core/modules/field/modules/text/text.info   |    1 +
 core/modules/field/modules/text/text.module |  252 ++++++++++++++++++---------
 core/modules/field/modules/text/text.test   |  174 +++++++++----------
 3 files changed, 255 insertions(+), 172 deletions(-)

diff --git a/core/modules/field/modules/text/text.info b/core/modules/field/modules/text/text.info
index b424d2d452f05bb23498dac2e1504826c59c7ae6..e3f045eea3a698f22e0602eb837afe698c4d6c6f 100644
--- a/core/modules/field/modules/text/text.info
+++ b/core/modules/field/modules/text/text.info
@@ -4,5 +4,6 @@ package = Core
 version = VERSION
 core = 8.x
 dependencies[] = field
+dependencies[] = filter
 files[] = text.test
 required = TRUE
diff --git a/core/modules/field/modules/text/text.module b/core/modules/field/modules/text/text.module
index d73814faaafa007f2f13499616ecd17edcf24b7f..6ac43d3f7757d71767cf0358561bbb33d79eec6c 100644
--- a/core/modules/field/modules/text/text.module
+++ b/core/modules/field/modules/text/text.module
@@ -324,124 +324,216 @@ function _text_sanitize($instance, $langcode, $item, $column) {
  *
  * If the end of the summary is not indicated using the <!--break--> delimiter
  * then we generate the summary automatically, trying to end it at a sensible
- * place such as the end of a paragraph, a line break, or the end of a
- * sentence (in that order of preference).
+ * place such as the end of a paragraph, a line break, a sentence, or at a
+ * whitespace character (in that order of preference).
  *
  * @param $text
  *   The content for which a summary will be generated.
  * @param $format
- *   The format of the content.
- *   If the PHP filter is present and $text contains PHP code, we do not
- *   split it up to prevent parse errors.
- *   If the line break filter is present then we treat newlines embedded in
- *   $text as line breaks.
- *   If the htmlcorrector filter is present, it will be run on the generated
- *   summary (if different from the incoming $text).
+ *   The format of the content. This parameter is actually unnecessary since
+ *   the format will have already been applied by _text_sanitize().
  * @param $size
- *   The desired character length of the summary. If omitted, the default
- *   value will be used. Ignored if the special delimiter is present
- *   in $text.
+ *   The desired character length of the summary, not counting HTML tags. If
+ *   omitted, the default value will be used. Ignored if the special delimiter
+ *   is present in $text.
  * @return
  *   The generated summary.
  */
 function text_summary($text, $format = NULL, $size = NULL) {
-
   if (!isset($size)) {
     // What used to be called 'teaser' is now called 'summary', but
     // the variable 'teaser_length' is preserved for backwards compatibility.
     $size = variable_get('teaser_length', 600);
   }
 
-  // Find where the delimiter is in the body
+  // Locate the delimiter in the body. strpos() deliberately yields a byte
+  // offset here, not a count of UTF-8 characters.
   $delimiter = strpos($text, '<!--break-->');
 
-  // If the size is zero, and there is no delimiter, the entire body is the summary.
-  if ($size == 0 && $delimiter === FALSE) {
+  // If there is no delimiter and the size is either 0 or larger than the text
+  // length, then return the full text.
+  if ($delimiter === FALSE && ($size == 0 || drupal_strlen($text) <= $size)) {
     return $text;
   }
 
+  // DOM manipulations needed to ensure valid HTML after trimming text can be
+  // slow, so check the cache first. This function does not apply a filter
+  // format, but it's related enough to the filter system to use its cache bin.
+  $cache_id = "text_summary:$size:" . drupal_hash_base64($text);
+  if ($cached = cache_get($cache_id, 'cache_filter')) {
+    return $cached->data;
+  }
+
   // If a valid delimiter has been specified, use it to chop off the summary.
   if ($delimiter !== FALSE) {
-    return substr($text, 0, $delimiter);
-  }
+    // $delimiter is a byte offset from strpos(), so slice with the byte-based
+    // substr(); drupal_substr() counts characters and would cut UTF-8 wrongly.
+    $output = substr($text, 0, $delimiter);
 
-  // We check for the presence of the PHP evaluator filter in the current
-  // format. If the body contains PHP code, we do not split it up to prevent
-  // parse errors.
-  if (isset($format)) {
-    $filters = filter_list_format($format);
-    if (isset($filters['php_code']) && $filters['php_code']->status && strpos($text, '<?') !== FALSE) {
-      return $text;
+    // The delimiter might be within another tag, resulting in $output now
+    // containing an unclosed tag. Fix that, but avoid the expensive DOM
+    // operations if there are no tags at all.
+    if (strpos($output, '<') !== FALSE) {
+      $output = filter_dom_serialize(filter_dom_load($output));
     }
   }
+  // Otherwise, use DOM parsing to grab the desired number of text characters,
+  // while preserving the HTML markup up to and including that text.
+  else {
+    // Generate a DOM Document to hold the full body.
+    $body_doc = filter_dom_load($text);
+    $body_node = $body_doc->documentElement;
 
-  // If we have a short body, the entire body is the summary.
-  if (drupal_strlen($text) <= $size) {
-    return $text;
-  }
-
-  // If the delimiter has not been specified, try to split at paragraph or
-  // sentence boundaries.
-
-  // The summary may not be longer than maximum length specified. Initial slice.
-  $summary = truncate_utf8($text, $size);
-
-  // Store the actual length of the UTF8 string -- which might not be the same
-  // as $size.
-  $max_rpos = strlen($summary);
+    // Generate a DOM Document to hold the summary.
+    $summary_doc = new DOMDocument();
 
-  // How much to cut off the end of the summary so that it doesn't end in the
-  // middle of a paragraph, sentence, or word.
-  // Initialize it to maximum in order to find the minimum.
-  $min_rpos = $max_rpos;
+    // Recursively copy each child node from $body_node to $summary_doc
+    // until $size limit is reached.
+    _text_summarize($body_node, $size, $summary_doc, $summary_doc);
 
-  // Store the reverse of the summary. We use strpos on the reversed needle and
-  // haystack for speed and convenience.
-  $reversed = strrev($summary);
-
-  // Build an array of arrays of break points grouped by preference.
-  $break_points = array();
-
-  // A paragraph near the end of sliced summary is most preferable.
-  $break_points[] = array('</p>' => 0);
+    // Convert the summary document back to XHTML.
+    $output = filter_dom_serialize($summary_doc);
+  }
+  // DOM automatically wraps plain-text in a <p> tag, but if the input was plain
+  // text, then the summary should be as well.
 
-  // If no complete paragraph then treat line breaks as paragraphs.
-  $line_breaks = array('<br />' => 6, '<br>' => 4);
-  // Newline only indicates a line break if line break converter
-  // filter is present.
-  if (isset($filters['filter_autop'])) {
-    $line_breaks["\n"] = 1;
+  if ($text === strip_tags($text)) {
+    $output = strip_tags($output);
   }
-  $break_points[] = $line_breaks;
+  $output = trim($output);
 
-  // If the first paragraph is too long, split at the end of a sentence.
-  $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1);
+  // Cache the result for later use.
+  cache_set($cache_id, $output, 'cache_filter');
+  return $output;
+}
 
-  // Iterate over the groups of break points until a break point is found.
-  foreach ($break_points as $points) {
-    // Look for each break point, starting at the end of the summary.
-    foreach ($points as $point => $offset) {
-      // The summary is already reversed, but the break point isn't.
-      $rpos = strpos($reversed, strrev($point));
-      if ($rpos !== FALSE) {
-        $min_rpos = min($rpos + $offset, $min_rpos);
+/**
+ * Helper function for text_summary().
+ *
+ * Recursively copies elements from $body to $summary, subtracting the length
+ * of the textContent portions from $size until $size reaches zero.
+ *
+ * @param $body
+ *   The source DOMNode.
+ * @param $size
+ *   The maximum number of textContent characters to copy.
+ * @param $summary
+ *   The destination DOMNode.
+ * @param $doc
+ *   The destination DOMDocument that owns $summary (or $summary itself on the
+ *   initial call, where the document is passed as the destination node).
+ * @param $parents
+ *   An array of tag names of ancestor nodes.
+ *
+ * @return
+ *   The number of additional characters left to copy.
+ */
+function _text_summarize($body, $size, $summary, $doc, $parents = array()) {
+  static $sentence_splitter;
+  static $word_splitter;
+  if (!isset($sentence_splitter)) {
+    // According to http://unicode.org/review/pr-23.html, these are the Unicode
+    // Sentence_Terminal characters.
+    $stops =
+      "\x21" .         // 'Exclamation mark'.
+      "\x2E" .         // 'Full stop'.
+      "\x3F" .         // 'Question mark'.
+      "\xD6\x89" .     // 'Armenian full stop'.
+      "\xD8\x9F" .     // 'Arabic question mark'.
+      "\xDB\x94" .     // 'Arabic full stop'.
+      "\xDC\x80" .     // 'Syriac end of paragraph'.
+      "\xDC\x81" .     // 'Syriac supralinear full stop'.
+      "\xDC\x82" .     // 'Syriac sublinear full stop'.
+      "\xE0\xA5\xA4" . // 'Devanagari danda'.
+      "\xE1\x81\x8A" . // 'Myanmar sign little section'.
+      "\xE1\x81\x8B" . // 'Myanmar sign section'.
+      "\xE1\x8D\xA2" . // 'Ethiopic full stop'.
+      "\xE1\x8D\xA7" . // 'Ethiopic question mark'.
+      "\xE1\x8D\xA8" . // 'Ethiopic paragraph separator'.
+      "\xE1\x99\xAE" . // 'Canadian syllabics full stop'.
+      "\xE1\xA0\x83" . // 'Mongolian full stop'.
+      "\xE1\xA0\x89" . // 'Mongolian manchu full stop' (U+1809).
+      "\xE2\x80\xBC" . // 'Double exclamation mark'.
+      "\xE2\x80\xBD" . // 'Interrobang'.
+      "\xE2\x81\x87" . // 'Double question mark'.
+      "\xE2\x81\x88" . // 'Question exclamation mark'.
+      "\xE2\x81\x89" . // 'Exclamation question mark'.
+      "\xE3\x80\x82" . // 'Ideographic full stop'.
+      "\xEF\xB9\x92" . // 'Small full stop'.
+      "\xEF\xB9\x97" . // 'Small exclamation mark'.
+      "\xEF\xBC\x81" . // 'Fullwidth exclamation mark'.
+      "\xEF\xBC\x8E" . // 'Fullwidth full stop'.
+      "\xEF\xBC\x9F" . // 'Fullwidth question mark' (U+FF1F).
+      "\xEF\xBD\xA1";  // 'Halfwidth ideographic full stop'.
+    // We split after Sentence_Terminal characters only if preceded by a Letter
+    // character and followed by a Separator character.
+    $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u';
+    // If no suitable sentence break is found, we split before any Unicode
+    // separator character.
+    $word_splitter = '/(?=\p{Z})/u';
+  }
+  if ($body->nodeType === XML_TEXT_NODE) {
+    $text_length = drupal_strlen($body->textContent);
+    if ($text_length <= $size) {
+      $size -= $text_length;
+      $summary->appendChild($doc->createTextNode($body->textContent));
+      return $size;
+    }
+    // Never split text inside a <code> element; the whole node is dropped.
+    if (in_array('code', $parents)) {
+      return 0;
+    }
+    $sentences = preg_split($sentence_splitter, $body->textContent);
+    $text = '';
+    foreach ($sentences as $sentence) {
+      $sentence_length = drupal_strlen($sentence);
+      // Only add the sentence if it fits within the length limit.
+      if ($sentence_length > $size) {
+        break;
       }
+      $text .= $sentence;
+      $size -= $sentence_length;
     }
-
-    // If a break point was found in this group, slice and stop searching.
-    if ($min_rpos !== $max_rpos) {
-      // Don't slice with length 0. Length must be <0 to slice from RHS.
-      $summary = ($min_rpos === 0) ? $summary : substr($summary, 0, 0 - $min_rpos);
-      break;
+    // If no suitable sentence break was found, try to break between words.
+    if ($text === '') {
+      $words = preg_split($word_splitter, $body->textContent);
+      foreach ($words as $word) {
+        $word_length = drupal_strlen($word);
+        // Only add the word if it fits within the length limit.
+        if ($word_length > $size) {
+          break;
+        }
+        $text .= $word;
+        $size -= $word_length;
+      }
     }
+    // Append the complete sentences or words, and return 0 to indicate
+    // completion.
+    $summary->appendChild($doc->createTextNode($text));
+    return 0;
   }
 
-  // If the htmlcorrector filter is present, apply it to the generated summary.
-  if (isset($filters['filter_htmlcorrector'])) {
-    $summary = _filter_htmlcorrector($summary);
+  // Recurse the DOM until the summary reaches the text length limit.
+  if ($body->nodeType === XML_ELEMENT_NODE) {
+    $node = $summary->appendChild($doc->createElement($body->tagName));
+    $parents[] = $body->tagName;
+    if ($body->hasAttributes()) {
+      foreach ($body->attributes as $attributeNode) {
+        $node->setAttribute($attributeNode->nodeName, $attributeNode->value);
+      }
+    }
+    if ($body->hasChildNodes()) {
+      foreach ($body->childNodes as $child) {
+        if ($size > 0) {
+          $size = _text_summarize($child, $size, $node, $doc, $parents);
+        }
+        else {
+          break;
+        }
+      }
+    }
   }
-
-  return $summary;
+  return $size;
 }
 
 /**
diff --git a/core/modules/field/modules/text/text.test b/core/modules/field/modules/text/text.test
index 2d936be773ffbc2621f9d9405454dd41294d7cc7..c17f163e64f8a1a8ddfebdbad9b1f53a1ecb81b6 100644
--- a/core/modules/field/modules/text/text.test
+++ b/core/modules/field/modules/text/text.test
@@ -258,7 +258,8 @@ class TextSummaryTestCase extends DrupalWebTestCase {
    */
   function testFirstSentenceQuestion() {
     $text = 'A question? A sentence. Another sentence.';
-    $expected = 'A question? A sentence.';
+    // NOTE(review): $format is NULL and input has no tags; confirm the <p>.
+    $expected = '<p>A question? A sentence.</p>';
     $this->callTextSummary($text, $expected, NULL, 30);
   }
 
@@ -270,11 +271,53 @@ class TextSummaryTestCase extends DrupalWebTestCase {
             'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . // 108
             'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. ' . // 103
             'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'; // 110
-    $expected = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' .
+    $expected = '<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' .
                 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' .
-                'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.';
-    // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word.
-    $this->callTextSummary($text, $expected, NULL, 340);
+                'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.</p>';
+    // Test that sentence splitting works when we replace the full stops with
+    // any other Unicode Sentence_Terminal character listed at
+    // http://unicode.org/review/pr-23.html.
+    $stops = array(
+      "\x21", // 'Exclamation mark'.
+      "\x2E", // 'Full stop'.
+      "\x3F", // 'Question mark'.
+      "\xD6\x89", // 'Armenian full stop'.
+      "\xD8\x9F", // 'Arabic question mark'.
+      "\xDB\x94", // 'Arabic full stop'.
+      "\xDC\x80", // 'Syriac end of paragraph'.
+      "\xDC\x81", // 'Syriac supralinear full stop'.
+      "\xDC\x82", // 'Syriac sublinear full stop'.
+      "\xE0\xA5\xA4", // 'Devanagari danda'.
+      "\xE1\x81\x8A", // 'Myanmar sign little section'.
+      "\xE1\x81\x8B", // 'Myanmar sign section'.
+      "\xE1\x8D\xA2", // 'Ethiopic full stop'.
+      "\xE1\x8D\xA7", // 'Ethiopic question mark'.
+      "\xE1\x8D\xA8", // 'Ethiopic paragraph separator'.
+      "\xE1\x99\xAE", // 'Canadian syllabics full stop'.
+      "\xE1\xA0\x83", // 'Mongolian full stop'.
+      "\xE1\xA0\x89", // 'Mongolian manchu full stop' (U+1809).
+      "\xE2\x80\xBC", // 'Double exclamation mark'.
+      "\xE2\x80\xBD", // 'Interrobang'.
+      "\xE2\x81\x87", // 'Double question mark'.
+      "\xE2\x81\x88", // 'Question exclamation mark'.
+      "\xE2\x81\x89", // 'Exclamation question mark'.
+      "\xE3\x80\x82", // 'Ideographic full stop'.
+      "\xEF\xB9\x92", // 'Small full stop'.
+      "\xEF\xB9\x97", // 'Small exclamation mark'.
+      "\xEF\xBC\x81", // 'Fullwidth exclamation mark'.
+      "\xEF\xBC\x8E", // 'Fullwidth full stop'.
+      "\xEF\xBC\x9F", // 'Fullwidth question mark' (U+FF1F).
+      "\xEF\xBD\xA1", // 'Halfwidth ideographic full stop'.
+    );
+    foreach ($stops as $stop) {
+      // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word.
+      $this->callTextSummary(
+        str_replace('.', $stop, $text),
+        str_replace('.', $stop, $expected),
+        NULL,
+        340
+      );
+    }
   }
 
   /**
@@ -286,104 +329,51 @@ class TextSummaryTestCase extends DrupalWebTestCase {
 
     // The summaries we expect text_summary() to return when $size is the index
     // of each array item.
-    // Using no text format:
-    $expected = array(
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<",
-      "<p",
-      "<p>",
-      "<p>\n",
-      "<p>\nH",
-      "<p>\nHi",
-      "<p>\nHi\n",
-      "<p>\nHi\n<",
-      "<p>\nHi\n</",
-      "<p>\nHi\n</p",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-    );
-
-    // And using a text format WITH the line-break and htmlcorrector filters.
-    $expected_lb = array(
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "",
-      "<p></p>",
-      "<p></p>",
-      "<p></p>",
-      "<p></p>",
-      "<p></p>",
-      "<p>\nHi</p>",
-      "<p>\nHi</p>",
-      "<p>\nHi</p>",
-      "<p>\nHi</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
+    // Using filtered_html format:
+    $expected = array (
+      0 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      1 => "<p></p>",
+      2 => "<p>Hi</p>",
+      3 => "<p>Hi</p>",
+      4 => "<p>Hi</p>\n<p></p>",
+      5 => "<p>Hi</p>\n<p></p>",
+      6 => "<p>Hi</p>\n<p></p>",
+      7 => "<p>Hi</p>\n<p></p>",
+      8 => "<p>Hi</p>\n<p>folks</p>",
+      9 => "<p>Hi</p>\n<p>folks</p>",
+      10 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
     );
 
     // Test text_summary() for different sizes.
-    for ($i = 0; $i <= 37; $i++) {
-      $this->callTextSummary($text, $expected[$i],    NULL, $i);
-      $this->callTextSummary($text, $expected_lb[$i], 'plain_text', $i);
-      $this->callTextSummary($text, $expected_lb[$i], 'filtered_html', $i);
+    for ($i = 0; $i <= 10; $i++) {
+      $this->callTextSummary($text, $expected[$i], 'filtered_html', $i);
     }
   }
 
   /**
+   * Tests that text inside a CODE element is dropped rather than split.
+   */
+  function testCode() {
+    $code = '<code>$example = "Sentence one.  Sentence two. Sentence three.</code>';
+    $text = 'This is an example code block:' . $code;
+    $expected = '<p>This is an example code block:<code/></p>';
+    $this->callTextSummary($text, $expected, 'full_html', 65);
+  }
+
+  /**
    * Calls text_summary() and asserts that the expected teaser is returned.
    */
   function callTextSummary($text, $expected, $format = NULL, $size = NULL) {
     $summary = text_summary($text, $format, $size);
-    $this->assertIdentical($summary, $expected, t('Generated summary "@summary" matches expected "@expected".', array('@summary' => $summary, '@expected' => $expected)));
+    // Quote both strings and render newlines as a literal '\n', so that
+    // whitespace differences between the summary and the expectation are
+    // visible in the assertion message.
+    $args = array('@summary' => $summary, '@expected' => $expected);
+    foreach ($args as $placeholder => $value) {
+      $args[$placeholder] = '"' . str_replace("\n", '\n', $value) . '"';
+    }
+    $message = t('Generated summary @summary matches expected @expected.', $args);
+    $this->assertIdentical($summary, $expected, $message);
   }
 
   /**
-- 
1.7.5.4