From 0c75b075642fd808df96d092b8792b61bf0a23d1 Mon Sep 17 00:00:00 2001
From: Bob Vincent <bobvin@pillars.net>
Date: Wed, 28 Sep 2011 17:17:01 -0400
Subject: [PATCH] Issue #221257 by pillarsdotnet: text_summary() should output
 valid HTML and Unicode text.

---
 modules/node/node.module |  253 +++++++++++++++++++++++++++++++---------------
 1 files changed, 173 insertions(+), 80 deletions(-)

diff --git a/modules/node/node.module b/modules/node/node.module
index 299dfc11daed85998fcd0764381cf7cf33065d62..e72ba38d7a13bec6aaaaa54408f164f396c05326 100644
--- a/modules/node/node.module
+++ b/modules/node/node.module
@@ -287,113 +287,206 @@ function node_teaser_include_verify(&$form, &$form_state) {
  *
  * If the end of the teaser is not indicated using the <!--break--> delimiter
  * then we generate the teaser automatically, trying to end it at a sensible
- * place such as the end of a paragraph, a line break, or the end of a
- * sentence (in that order of preference).
+ * place such as the end of a paragraph, a line break, a sentence, or at a
+ * whitespace character (in that order of preference).
  *
  * @param $body
  *   The content for which a teaser will be generated.
  * @param $format
- *   The format of the content. If the content contains PHP code, we do not
- *   split it up to prevent parse errors. If the line break filter is present
- *   then we treat newlines embedded in $body as line breaks.
+ *   The format of the content. The $body string will be passed through
+ *   check_markup() before generating a summary.
  * @param $size
- *   The desired character length of the teaser. If omitted, the default
- *   value will be used. Ignored if the special delimiter is present
- *   in $body.
+ *   The desired character length of the summary, not counting HTML tags. If
+ *   omitted, the default value will be used. Ignored if the special delimiter
+ *   is present in $body.
  * @return
  *   The generated teaser.
  */
 function node_teaser($body, $format = NULL, $size = NULL) {
-
-  if (!isset($size)) {
-    $size = variable_get('teaser_length', 600);
+  // Replace NULL format with FILTER_FORMAT_DEFAULT.
+  if (!isset($format)) {
+    $format = FILTER_FORMAT_DEFAULT;
   }
 
   // Find where the delimiter is in the body
   $delimiter = strpos($body, '<!--break-->');
 
-  // If the size is zero, and there is no delimiter, the entire body is the teaser.
-  if ($size == 0 && $delimiter === FALSE) {
-    return $body;
-  }
-
-  // If a valid delimiter has been specified, use it to chop off the teaser.
+  // If a valid delimiter has been specified, use it to chop off the summary.
   if ($delimiter !== FALSE) {
-    return substr($body, 0, $delimiter);
+    // strpos() returns a byte offset and there is no drupal_strpos(), so we
+    // must cut with substr() rather than drupal_substr() to stay UTF-8 safe.
+    return trim(check_markup(substr($body, 0, $delimiter), $format, FALSE));
   }
 
-  // We check for the presence of the PHP evaluator filter in the current
-  // format. If the body contains PHP code, we do not split it up to prevent
-  // parse errors.
-  if (isset($format)) {
-    $filters = filter_list_format($format);
-    if (isset($filters['php/0']) && strpos($body, '<?') !== FALSE) {
-      return $body;
-    }
+  // Start with the trimmed, formatted version of $body; every length check
+  // and all splitting below operate on this filtered markup.
+  $body = trim(check_markup($body, $format, FALSE));
+
+  if (!isset($size)) {
+    // What used to be called 'teaser' is now called 'summary', but
+    // the variable 'teaser_length' is preserved for backwards compatibility.
+    $size = variable_get('teaser_length', 600);
+  }
+
+  // If the size is zero, the entire body is the summary.
+  if ($size == 0) {
+    return $body;
   }
 
-  // If we have a short body, the entire body is the teaser.
+  // If we have a short body, the entire body is the summary.
   if (drupal_strlen($body) <= $size) {
     return $body;
   }
 
-  // If the delimiter has not been specified, try to split at paragraph or
-  // sentence boundaries.
+  // Generate a DOM Document to hold the full body.
+  $body_doc = new DOMDocument();
+  // The following is copied from the D7 filter_dom_load() function.
+  @$body_doc->loadHTML('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>' . $body . '</body></html>');
+  $body_node = $body_doc->getElementsByTagName('body')->item(0);
 
-  // The teaser may not be longer than maximum length specified. Initial slice.
-  $teaser = truncate_utf8($body, $size);
+  // Generate a DOM Document to hold the summary.
+  $summary_doc = new DOMDocument();
 
-  // Store the actual length of the UTF8 string -- which might not be the same
-  // as $size.
-  $max_rpos = strlen($teaser);
+  // Recursively copy each child node from $body_node to $summary_doc
+  // until $size limit is reached.
+  _text_summarize($body_node, $size, $summary_doc, $summary_doc);
 
-  // How much to cut off the end of the teaser so that it doesn't end in the
-  // middle of a paragraph, sentence, or word.
-  // Initialize it to maximum in order to find the minimum.
-  $min_rpos = $max_rpos;
-
-  // Store the reverse of the teaser.  We use strpos on the reversed needle and
-  // haystack for speed and convenience.
-  $reversed = strrev($teaser);
-
-  // Build an array of arrays of break points grouped by preference.
-  $break_points = array();
-
-  // A paragraph near the end of sliced teaser is most preferable.
-  $break_points[] = array('</p>' => 0);
-
-  // If no complete paragraph then treat line breaks as paragraphs.
-  $line_breaks = array('<br />' => 6, '<br>' => 4);
-  // Newline only indicates a line break if line break converter
-  // filter is present.
-  if (isset($filters['filter/1'])) {
-    $line_breaks["\n"] = 1;
+  // Convert the summary document back to XHTML.  Note that this version lacks
+  // the CDATA escaping functionality of the D7 filter_dom_serialize() function.
+  $body_node = $summary_doc->getElementsByTagName('body')->item(0);
+  $output = '';
+  foreach ($body_node->childNodes as $child_node) {
+    $output .= $summary_doc->saveXML($child_node);
   }
-  $break_points[] = $line_breaks;
+  $output = preg_replace('|<([^> ]*)/>|', '<$1 />', $output);
 
-  // If the first paragraph is too long, split at the end of a sentence.
-  $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1);
+  // DOM automatically wraps plain-text in a <p> tag, but if the original
+  // formatted version was plaintext, then the summary should be plaintext
+  // also.
+  if ($body === strip_tags($body)) {
+    $output = strip_tags($output);
+  }
+  return trim($output). "\r\n<!--break-->\r\n";
+}
 
-  // Iterate over the groups of break points until a break point is found.
-  foreach ($break_points as $points) {
-    // Look for each break point, starting at the end of the teaser.
-    foreach ($points as $point => $offset) {
-      // The teaser is already reversed, but the break point isn't.
-      $rpos = strpos($reversed, strrev($point));
-      if ($rpos !== FALSE) {
-        $min_rpos = min($rpos + $offset, $min_rpos);
+/**
+ * Helper function for node_teaser().
+ *
+ * Recursively copies elements from $body to $summary, subtracting the length
+ * of the textContent portions from $size until $size reaches zero.
+ *
+ * @param $body
+ *   The source DOMNode.
+ * @param $size
+ *   The maximum number of textContent characters to copy.
+ * @param $summary
+ *   The destination DOMNode.
+ * @param $doc
+ *   The destination DOMDocument. Should be the same as the
+ *   $summary->ownerDocument property.
+ * @param $parents
+ *   An array of tag names of ancestor nodes.
+ *
+ * @return
+ *   The number of additional characters left to copy.
+ */
+function _text_summarize($body, $size, $summary, $doc, $parents = array()) {
+  static $sentence_splitter;
+  static $word_splitter;
+  if (!isset($sentence_splitter)) {
+    // According to http://unicode.org/review/pr-23.html, these are the Unicode
+    // Sentence_Terminal characters.
+    $stops =
+      "\x21" .         // 'Exclamation mark'.
+      "\x2E" .         // 'Full stop'.
+      "\x3F" .         // 'Question mark'.
+      "\xD6\x89" .     // 'Armenian full stop'.
+      "\xD8\x9F" .     // 'Arabic question mark'.
+      "\xDB\x94" .     // 'Arabic full stop'.
+      "\xDC\x80" .     // 'Syriac end of paragraph'.
+      "\xDC\x81" .     // 'Syriac supralinear full stop'.
+      "\xDC\x82" .     // 'Syriac sublinear full stop'.
+      "\xE0\xA5\xA4" . // 'Devanagari danda'.
+      "\xE1\x81\x8A" . // 'Myanmar sign little section'.
+      "\xE1\x81\x8B" . // 'Myanmar sign section'.
+      "\xE1\x8D\xA2" . // 'Ethiopic full stop'.
+      "\xE1\x8D\xA7" . // 'Ethiopic question mark'.
+      "\xE1\x8D\xA8" . // 'Ethiopic paragraph separator'.
+      "\xE1\x99\xAE" . // 'Canadian syllabics full stop'.
+      "\xE1\xA0\x83" . // 'Mongolian full stop'.
+      "\xE1\xA0\x89" . // 'Mongolian manchu full stop'.
+      "\xE2\x80\xBC" . // 'Double exclamation mark'.
+      "\xE2\x80\xBD" . // 'Interrobang'.
+      "\xE2\x81\x87" . // 'Double question mark'.
+      "\xE2\x81\x88" . // 'Question exclamation mark'.
+      "\xE2\x81\x89" . // 'Exclamation question mark'.
+      "\xE3\x80\x82" . // 'Ideographic full stop'.
+      "\xEF\xB9\x92" . // 'Small full stop'.
+      "\xEF\xB9\x97" . // 'Small exclamation mark'.
+      "\xEF\xBC\x81" . // 'Fullwidth exclamation mark'.
+      "\xEF\xBC\x8E" . // 'Fullwidth full stop'.
+      "\xEF\xBC\x9F" . // 'Fullwidth question mark'.
+      "\xEF\xBD\xA1";  // 'Halfwidth ideographic full stop'.
+    // Split after a stop that follows a Letter and precedes a Separator.
+    $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u';
+    // Fallback when no sentence break fits: split before any Separator.
+    $word_splitter = '/(?=\p{Z})/u';
+  }
+  if ($body->nodeType === XML_TEXT_NODE) {
+    $text_length = drupal_strlen($body->textContent);
+    if ($text_length <= $size) {
+      $size -= $text_length;
+      $summary->appendChild($doc->createTextNode($body->textContent));
+      return $size;
+    }
+    // We avoid breaking text nodes within code blocks.
+    if (in_array('code', $parents)) {
+      // Return zero to avoid adding subsequent text nodes.
+      return 0;
+    }
+    $sentences = preg_split($sentence_splitter, $body->textContent);
+    $text = '';
+    foreach ($sentences as $sentence) {
+      $sentence_length = drupal_strlen($sentence);
+      // Only add the sentence if it fits within the length limit.
+      if ($sentence_length > $size) {
+        break;
       }
+      $text .= $sentence;
+      $size -= $sentence_length;
+    }
-
-    // If a break point was found in this group, slice and return the teaser.
-    if ($min_rpos !== $max_rpos) {
-      // Don't slice with length 0.  Length must be <0 to slice from RHS.
-      return ($min_rpos === 0) ? $teaser : substr($teaser, 0, 0 - $min_rpos);
+    // If no suitable sentence break was found, try to break between words.
+    if ($text === '') {
+      $words = preg_split($word_splitter, $body->textContent);
+      foreach ($words as $word) {
+        $word_length = drupal_strlen($word);
+        // Only add the word if it fits within the length limit.
+        if ($word_length > $size) {
+          break;
+        }
+        $text .= $word;
+        $size -= $word_length;
+      }
     }
+    $summary->appendChild($doc->createTextNode($text));
+    // Return zero to avoid adding subsequent text nodes.
+    return 0;
   }
-
-  // If a break point was not found, still return a teaser.
-  return $teaser;
+  if ($body->nodeType === XML_ELEMENT_NODE) {
+    // Copy attributes too, and keep childless elements (<br />, <img />).
+    $node = $summary->appendChild($doc->createElement($body->tagName));
+    foreach ($body->attributes as $attribute) {
+      $node->setAttribute($attribute->nodeName, $attribute->nodeValue);
+    }
+    $parents[] = $body->tagName;
+    foreach ($body->childNodes as $child) {
+      if ($size <= 0) {
+        break;
+      }
+      $size = _text_summarize($child, $size, $node, $doc, $parents);
+    }
+  }
+  return $size;
 }
 
 /**
@@ -1269,10 +1362,10 @@ function node_search($op = 'search', $keys = NULL) {
         $join2 .= ' LEFT JOIN {node_counter} nc ON nc.nid = i.sid';
         $total += $weight;
       }
-      
-      // When all search factors are disabled (ie they have a weight of zero), 
-      // the default score is based only on keyword relevance and there is no need to 
-      // adjust the score of each item. 
+
+      // When all search factors are disabled (ie they have a weight of zero),
+      // the default score is based only on keyword relevance and there is no need to
+      // adjust the score of each item.
       if ($total == 0) {
         $select2 = 'i.relevance AS score';
         $total = 1;
@@ -1280,7 +1373,7 @@ function node_search($op = 'search', $keys = NULL) {
       else {
         $select2 = implode(' + ', $ranking) . ' AS score';
       }
-      
+
       // Do search.
       $find = do_search($keys, 'node', 'INNER JOIN {node} n ON n.nid = i.sid '. $join1, $conditions1 . (empty($where1) ? '' : ' AND '. $where1), $arguments1, $select2, $join2, $arguments2);
 
@@ -1708,7 +1801,7 @@ function node_feed($nids = FALSE, $channel = array()) {
         $item->body = $content;
         unset($item->teaser);
       }
-    
+
       // Allow modules to modify the fully-built node.
       node_invoke_nodeapi($item, 'alter', $teaser, FALSE);
     }
-- 
1.7.5.4