From 6b1981e275412aea0d3deaec7418d04070c39ed7 Mon Sep 17 00:00:00 2001
From: Bob Vincent <bobvin@pillars.net>
Date: Mon, 16 May 2011 10:05:03 -0400
Subject: [PATCH] Issue #221257 by pillarsdotnet: text_summary() should be HTML-aware.

---
 modules/field/modules/text/text.info   |    1 +
 modules/field/modules/text/text.module |  217 ++++++++++++++++++++------------
 modules/field/modules/text/text.test   |  142 ++++++++-------------
 3 files changed, 189 insertions(+), 171 deletions(-)

diff --git a/modules/field/modules/text/text.info b/modules/field/modules/text/text.info
index b424d2d452f05bb23498dac2e1504826c59c7ae6..e3f045eea3a698f22e0602eb837afe698c4d6c6f 100644
--- a/modules/field/modules/text/text.info
+++ b/modules/field/modules/text/text.info
@@ -4,5 +4,6 @@ package = Core
 version = VERSION
 core = 8.x
 dependencies[] = field
+dependencies[] = filter
 files[] = text.test
 required = TRUE
diff --git a/modules/field/modules/text/text.module b/modules/field/modules/text/text.module
index 89c605cf2c046eb48d448e20d27a63ef88a8583f..cf3ba27cf755d715ba27a25b3c43a9a933f59783 100644
--- a/modules/field/modules/text/text.module
+++ b/modules/field/modules/text/text.module
@@ -330,49 +330,38 @@ function _text_sanitize($instance, $langcode, $item, $column) {
  * @param $text
  *   The content for which a summary will be generated.
  * @param $format
- *   The format of the content.
- *   If the PHP filter is present and $text contains PHP code, we do not
- *   split it up to prevent parse errors.
- *   If the line break filter is present then we treat newlines embedded in
- *   $text as line breaks.
- *   If the htmlcorrector filter is present, it will be run on the generated
- *   summary (if different from the incoming $text).
+ *   The format of the content. The $text string will be passed through
+ *   check_markup() before generating a summary.
  * @param $size
- *   The desired character length of the summary. If omitted, the default
- *   value will be used. Ignored if the special delimiter is present
- *   in $text.
+ *   The desired character length of the summary, not counting HTML tags. If
+ *   omitted, the default value will be used. Ignored if the special delimiter
+ *   is present in $text.
  * @return
  *   The generated summary.
  */
 function text_summary($text, $format = NULL, $size = NULL) {
-
-  if (!isset($size)) {
-    // What used to be called 'teaser' is now called 'summary', but
-    // the variable 'teaser_length' is preserved for backwards compatibility.
-    $size = variable_get('teaser_length', 600);
-  }
-
   // Find where the delimiter is in the body
   $delimiter = strpos($text, '<!--break-->');
 
-  // If the size is zero, and there is no delimiter, the entire body is the summary.
-  if ($size == 0 && $delimiter === FALSE) {
-    return $text;
-  }
-
   // If a valid delimiter has been specified, use it to chop off the summary.
   if ($delimiter !== FALSE) {
-    return substr($text, 0, $delimiter);
+    // Since there is no drupal_strpos(), we must use substr() instead of
+    // drupal_substr() here, or we'll break on UTF-8 input.
+    return trim(check_markup(substr($text, 0, $delimiter), $format));
   }
 
-  // We check for the presence of the PHP evaluator filter in the current
-  // format. If the body contains PHP code, we do not split it up to prevent
-  // parse errors.
-  if (isset($format)) {
-    $filters = filter_list_format($format);
-    if (isset($filters['php_code']) && $filters['php_code']->status && strpos($text, '<?') !== FALSE) {
-      return $text;
-    }
+  // Start with the trimmed, formatted version of $text.
+  $text = trim(check_markup($text, $format));
+
+  if (!isset($size)) {
+    // What used to be called 'teaser' is now called 'summary', but
+    // the variable 'teaser_length' is preserved for backwards compatibility.
+    $size = variable_get('teaser_length', 600);
+  }
+
+  // If the size is zero, the entire body is the summary.
+  if ($size == 0) {
+    return $text;
   }
 
   // If we have a short body, the entire body is the summary.
@@ -380,68 +369,130 @@ function text_summary($text, $format = NULL, $size = NULL) {
     return $text;
   }
 
-  // If the delimiter has not been specified, try to split at paragraph or
-  // sentence boundaries.
+  // Generate a DOM Document to hold the full body.
+  $body_doc = filter_dom_load($text);
+  $body_node = $body_doc->documentElement;
 
-  // The summary may not be longer than maximum length specified. Initial slice.
-  $summary = truncate_utf8($text, $size);
+  // Generate a DOM Document to hold the summary.
+  $summary_doc = new DOMDocument();
 
-  // Store the actual length of the UTF8 string -- which might not be the same
-  // as $size.
-  $max_rpos = strlen($summary);
+  // Recursively copy each child node from $body_node to $summary_doc
+  // until $size limit is reached.
+  _text_summarize($body_node, $size, $summary_doc, $summary_doc);
 
-  // How much to cut off the end of the summary so that it doesn't end in the
-  // middle of a paragraph, sentence, or word.
-  // Initialize it to maximum in order to find the minimum.
-  $min_rpos = $max_rpos;
+  // Convert the summary document back to XHTML.
+  $output = filter_dom_serialize($summary_doc);
 
-  // Store the reverse of the summary. We use strpos on the reversed needle and
-  // haystack for speed and convenience.
-  $reversed = strrev($summary);
-
-  // Build an array of arrays of break points grouped by preference.
-  $break_points = array();
-
-  // A paragraph near the end of sliced summary is most preferable.
-  $break_points[] = array('</p>' => 0);
-
-  // If no complete paragraph then treat line breaks as paragraphs.
-  $line_breaks = array('<br />' => 6, '<br>' => 4);
-  // Newline only indicates a line break if line break converter
-  // filter is present.
-  if (isset($filters['filter_autop'])) {
-    $line_breaks["\n"] = 1;
+  // DOM automatically wraps plain-text in a <p> tag, but if the original
+  // formatted version was plaintext, then the summary should be plaintext
+  // also.
+  if ($text === strip_tags($text)) {
+    $output = strip_tags($output);
   }
-  $break_points[] = $line_breaks;
-
-  // If the first paragraph is too long, split at the end of a sentence.
-  $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1);
+  return trim($output);
+}
 
-  // Iterate over the groups of break points until a break point is found.
-  foreach ($break_points as $points) {
-    // Look for each break point, starting at the end of the summary.
-    foreach ($points as $point => $offset) {
-      // The summary is already reversed, but the break point isn't.
-      $rpos = strpos($reversed, strrev($point));
-      if ($rpos !== FALSE) {
-        $min_rpos = min($rpos + $offset, $min_rpos);
-      }
+/**
+ * Helper function for text_summary().
+ *
+ * Recursively copies nodes from $body to $summary (elements keep their
+ * attributes), charging text length against $size until it reaches zero.
+ *
+ * @param $body
+ *   The source DOMNode.
+ * @param $size
+ *   The maximum number of textContent characters to copy.
+ * @param $summary
+ *   The destination DOMNode.
+ * @param $doc
+ *   The destination DOMDocument: $summary itself at the top level, otherwise
+ *   the $summary->ownerDocument property.
+ * @param $parents
+ *   An array of upper-cased tag names of the ancestors of $body.
+ *
+ * @return
+ *   The number of additional characters left to copy.
+ */
+function _text_summarize($body, $size, $summary, $doc, $parents = array()) {
+  static $sentence_splitter;
+  if (!isset($sentence_splitter)) {
+    // Sentence_Terminal characters from http://unicode.org/review/pr-23.html,
+    // spelled as UTF-8 byte escapes so they survive mail/patch re-encoding.
+    $stops =
+      '!' . // Exclamation mark.
+      '.' . // Full stop.
+      '?' . // Question mark.
+      "\xD6\x89" . // Armenian full stop (U+0589).
+      "\xD8\x9F" . // Arabic question mark (U+061F).
+      "\xDB\x94" . // Arabic full stop (U+06D4).
+      "\xDC\x80" . // Syriac end of paragraph (U+0700).
+      "\xDC\x81" . // Syriac supralinear full stop (U+0701).
+      "\xDC\x82" . // Syriac sublinear full stop (U+0702).
+      "\xE0\xA5\xA4" . // Devanagari danda (U+0964).
+      "\xE1\x81\x8A" . // Myanmar sign little section (U+104A).
+      "\xE1\x81\x8B" . // Myanmar sign section (U+104B).
+      "\xE1\x8D\xA2" . // Ethiopic full stop (U+1362).
+      "\xE1\x8D\xA7" . // Ethiopic question mark (U+1367).
+      "\xE1\x8D\xA8" . // Ethiopic paragraph separator (U+1368).
+      "\xE1\x99\xAE" . // Canadian syllabics full stop (U+166E).
+      "\xE1\xA0\x83" . // Mongolian full stop (U+1803).
+      "\xE1\xA0\x89" . // Mongolian manchu full stop (U+1809).
+      "\xE2\x80\xBC" . // Double exclamation mark (U+203C).
+      "\xE2\x80\xBD" . // Interrobang (U+203D).
+      "\xE2\x81\x87" . // Double question mark (U+2047).
+      "\xE2\x81\x88" . // Question exclamation mark (U+2048).
+      "\xE2\x81\x89" . // Exclamation question mark (U+2049).
+      "\xE3\x80\x82" . // Ideographic full stop (U+3002).
+      "\xEF\xB9\x92" . // Small full stop (U+FE52).
+      "\xEF\xB9\x97" . // Small exclamation mark (U+FE57).
+      "\xEF\xBC\x81" . // Fullwidth exclamation mark (U+FF01).
+      "\xEF\xBC\x8E" . // Fullwidth full stop (U+FF0E).
+      "\xEF\xBC\x9F" . // Fullwidth question mark (U+FF1F).
+      "\xEF\xBD\xA1";  // Halfwidth ideographic full stop (U+FF61).
+    // We split after Sentence_Terminal characters only if preceded by a Letter
+    // character and followed by a Separator character.
+    $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u';
+  }
+  if ($body->nodeType === XML_TEXT_NODE) {
+    $text_length = drupal_strlen($body->textContent);
+    if ($text_length <= $size) {
+      $size -= $text_length;
+      $summary->appendChild($doc->createTextNode($body->textContent));
+      return $size;
+    }
-
-    // If a break point was found in this group, slice and stop searching.
-    if ($min_rpos !== $max_rpos) {
-      // Don't slice with length 0. Length must be <0 to slice from RHS.
-      $summary = ($min_rpos === 0) ? $summary : substr($summary, 0, 0 - $min_rpos);
-      break;
+    // We avoid breaking text nodes within CODE blocks.
+    if (in_array('CODE', $parents, TRUE)) {
+      // Return zero to avoid adding subsequent text nodes.
+      return 0;
     }
+    $sentences = preg_split($sentence_splitter, $body->textContent);
+    $text = '';
+    foreach ($sentences as $sentence) {
+      $sentence_length = drupal_strlen($sentence);
+      // Only add the sentence if it fits within the length limit.
+      if ($sentence_length > $size) {
+        break;
+      }
+      $text .= $sentence;
+      $size -= $sentence_length;
+    }
+    $summary->appendChild($doc->createTextNode($text));
+    // Return zero to avoid adding subsequent text nodes.
+    return 0;
   }
-
-  // If the htmlcorrector filter is present, apply it to the generated summary.
-  if (isset($filters['filter_htmlcorrector'])) {
-    $summary = _filter_htmlcorrector($summary);
+  if ($body->nodeType === XML_ELEMENT_NODE) {
+    // Shallow-copy the element so its attributes (href, src, ...) are kept.
+    $node = $summary->appendChild($doc->importNode($body->cloneNode(FALSE), TRUE));
+    $parents[] = drupal_strtoupper($body->tagName);
+    foreach ($body->childNodes as $child) {
+      // Stop descending once the character budget is spent.
+      if ($size <= 0) {
+        break;
+      }
+      $size = _text_summarize($child, $size, $node, $doc, $parents);
+    }
+  }
-
-  return $summary;
+  return $size;
 }
 
 /**
diff --git a/modules/field/modules/text/text.test b/modules/field/modules/text/text.test
index b42fed7e09894d352b3ecdd27431234e6ae77f72..ff3ff044c30e12bb44073e80bde98a33ef8b2be9 100644
--- a/modules/field/modules/text/text.test
+++ b/modules/field/modules/text/text.test
@@ -258,7 +258,8 @@ class TextSummaryTestCase extends DrupalWebTestCase {
    */
   function testFirstSentenceQuestion() {
     $text = 'A question? A sentence. Another sentence.';
-    $expected = 'A question? A sentence.';
+    // The default format runs the auto-paragraph filter, hence the <p> wrapper.
+    $expected = '<p>A question? A sentence.</p>';
     $this->callTextSummary($text, $expected, NULL, 30);
   }
 
@@ -270,9 +271,9 @@ class TextSummaryTestCase extends DrupalWebTestCase {
             'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . // 108
             'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. ' . // 103
             'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'; // 110
-    $expected = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' .
+    $expected = '<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' .
                 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' .
-                'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.';
+                'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.</p>';
     // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word.
     $this->callTextSummary($text, $expected, NULL, 340);
   }
@@ -286,95 +287,52 @@ class TextSummaryTestCase extends DrupalWebTestCase {
 
     // The summaries we expect text_summary() to return when $size is the index
     // of each array item.
-    // Using no text format:
-    $expected = array(
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<",
-      "<p",
-      "<p>",
-      "<p>\n",
-      "<p>\nH",
-      "<p>\nHi",
-      "<p>\nHi\n",
-      "<p>\nHi\n<",
-      "<p>\nHi\n</",
-      "<p>\nHi\n</p",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-    );
 
-    // And using a text format WITH the line-break and htmlcorrector filters.
-    $expected_lb = array(
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "",
-      "<p></p>",
-      "<p></p>",
-      "<p></p>",
-      "<p></p>",
-      "<p></p>",
-      "<p>\nHi</p>",
-      "<p>\nHi</p>",
-      "<p>\nHi</p>",
-      "<p>\nHi</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
-      "<p>\nHi\n</p>\n<p>\nfolks\n<br />\n!\n</p>",
+    // Using filtered_html format:
+    $expected = array (
+      0 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      1 => "<p></p>",
+      2 => "<p>Hi</p>",
+      3 => "<p>Hi</p>",
+      4 => "<p>Hi</p>\n<p></p>",
+      5 => "<p>Hi</p>\n<p></p>",
+      6 => "<p>Hi</p>\n<p></p>",
+      7 => "<p>Hi</p>\n<p></p>",
+      8 => "<p>Hi</p>\n<p>folks</p>",
+      9 => "<p>Hi</p>\n<p>folks</p>",
+      10 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      11 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      12 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      13 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      14 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      15 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      16 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      17 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      18 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      19 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      20 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      21 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      22 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      23 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      24 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      25 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      26 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      27 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      28 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      29 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      30 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      31 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      32 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      33 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      34 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      35 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      36 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
+      37 => "<p>Hi</p>\n<p>folks</p>\n<p>!</p>",
     );
 
     // Test text_summary() for different sizes.
     for ($i = 0; $i <= 37; $i++) {
-      $this->callTextSummary($text, $expected[$i],    NULL, $i);
-      $this->callTextSummary($text, $expected_lb[$i], 'plain_text', $i);
-      $this->callTextSummary($text, $expected_lb[$i], 'filtered_html', $i);
+      $this->callTextSummary($text, $expected[$i], 'filtered_html', $i);
     }
   }
 
@@ -383,7 +341,15 @@ class TextSummaryTestCase extends DrupalWebTestCase {
    */
   function callTextSummary($text, $expected, $format = NULL, $size = NULL) {
     $summary = text_summary($text, $format, $size);
-    $this->assertIdentical($summary, $expected, t('Generated summary "@summary" matches expected "@expected".', array('@summary' => $summary, '@expected' => $expected)));
+    // Build the placeholder values for the assertion message: each string is
+    // quoted and its newlines are shown as a literal \n.
+    $args = array();
+    foreach (array('@summary' => $summary, '@expected' => $expected) as $key => $value) {
+      $args[$key] = '"' . str_replace("\n", '\n', $value) . '"';
+    }
+    $message = t('Generated summary @summary matches expected @expected.', $args);
+    // The comparison itself still uses the unmodified strings.
+    $this->assertIdentical($summary, $expected, $message);
   }
 
   /**
-- 
1.7.4.1