From 0c75b075642fd808df96d092b8792b61bf0a23d1 Mon Sep 17 00:00:00 2001 From: Bob Vincent Date: Wed, 28 Sep 2011 17:17:01 -0400 Subject: [PATCH] Issue #221257 by pillarsdotnet: text_summary() should output valid HTML and Unicode text. --- modules/node/node.module | 253 +++++++++++++++++++++++++++++++--------------- 1 files changed, 173 insertions(+), 80 deletions(-) diff --git a/modules/node/node.module b/modules/node/node.module index 299dfc11daed85998fcd0764381cf7cf33065d62..e72ba38d7a13bec6aaaaa54408f164f396c05326 100644 --- a/modules/node/node.module +++ b/modules/node/node.module @@ -287,113 +287,206 @@ function node_teaser_include_verify(&$form, &$form_state) { * * If the end of the teaser is not indicated using the delimiter * then we generate the teaser automatically, trying to end it at a sensible - * place such as the end of a paragraph, a line break, or the end of a - * sentence (in that order of preference). + * place such as the end of a paragraph, a line break, a sentence, or at a + * whitespace character (in that order of preference). * * @param $body * The content for which a teaser will be generated. * @param $format - * The format of the content. If the content contains PHP code, we do not - * split it up to prevent parse errors. If the line break filter is present - * then we treat newlines embedded in $body as line breaks. + * The format of the content. The $text string will be passed through + * check_markup() before generating a summary. * @param $size - * The desired character length of the teaser. If omitted, the default - * value will be used. Ignored if the special delimiter is present - * in $body. + * The desired character length of the summary, not counting HTML tags. If + * omitted, the default value will be used. Ignored if the special delimiter + * is present in $text. * @return * The generated teaser. */ function node_teaser($body, $format = NULL, $size = NULL) { - - if (!isset($size)) { - $size = variable_get('teaser_length', 600); + // Replace NULL format with FILTER_FORMAT_DEFAULT. + if (!isset($format)) { + $format = FILTER_FORMAT_DEFAULT; } // Find where the delimiter is in the body $delimiter = strpos($body, ''); - // If the size is zero, and there is no delimiter, the entire body is the teaser. - if ($size == 0 && $delimiter === FALSE) { - return $body; - } - - // If a valid delimiter has been specified, use it to chop off the teaser. + // If a valid delimiter has been specified, use it to chop off the summary. if ($delimiter !== FALSE) { - return substr($body, 0, $delimiter); + // Since there is no drupal_strpos(), we must use substr() instead of + // drupal_substr() here, or we'll break on UTF-8 input. + return trim(check_markup(substr($body, 0, $delimiter), $format, FALSE)); } - // We check for the presence of the PHP evaluator filter in the current - // format. If the body contains PHP code, we do not split it up to prevent - // parse errors. - if (isset($format)) { - $filters = filter_list_format($format); - if (isset($filters['php/0']) && strpos($body, 'loadHTML('' . $body . ''); + $body_node = $body_doc->getElementsByTagName('body')->item(0); - // The teaser may not be longer than maximum length specified. Initial slice. - $teaser = truncate_utf8($body, $size); + // Generate a DOM Document to hold the summary. + $summary_doc = new DOMDocument(); - // Store the actual length of the UTF8 string -- which might not be the same - // as $size. - $max_rpos = strlen($teaser); + // Recursively copy each child node from $body_node to $summary_doc + // until $size limit is reached. + _text_summarize($body_node, $size, $summary_doc, $summary_doc); - // How much to cut off the end of the teaser so that it doesn't end in the - // middle of a paragraph, sentence, or word. - // Initialize it to maximum in order to find the minimum. - $min_rpos = $max_rpos; - - // Store the reverse of the teaser. We use strpos on the reversed needle and - // haystack for speed and convenience. - $reversed = strrev($teaser); - - // Build an array of arrays of break points grouped by preference. - $break_points = array(); - - // A paragraph near the end of sliced teaser is most preferable. - $break_points[] = array('

' => 0); - - // If no complete paragraph then treat line breaks as paragraphs. - $line_breaks = array('
' => 6, '
' => 4); - // Newline only indicates a line break if line break converter - // filter is present. - if (isset($filters['filter/1'])) { - $line_breaks["\n"] = 1; + // Convert the summary document back to XHTML. Note that this version lacks + // the CDATA escaping functionality of the D7 filter_dom_serialize() function. + $body_node = $summary_doc->getElementsByTagName('body')->item(0); + $output = ''; + foreach ($body_node->childNodes as $child_node) { + $output .= $summary_doc->saveXML($child_node); } - $break_points[] = $line_breaks; + $output = preg_replace('|<([^> ]*)/>|', '<$1 />', $output); - // If the first paragraph is too long, split at the end of a sentence. - $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1); + // DOM automatically wraps plain-text in a

tag, but if the original + // formatted version was plaintext, then the summary should be plaintext + // also. + if ($body === strip_tags($body)) { + $output = strip_tags($output); + } + return trim($output). "\r\n\r\n"; +} - // Iterate over the groups of break points until a break point is found. - foreach ($break_points as $points) { - // Look for each break point, starting at the end of the teaser. - foreach ($points as $point => $offset) { - // The teaser is already reversed, but the break point isn't. - $rpos = strpos($reversed, strrev($point)); - if ($rpos !== FALSE) { - $min_rpos = min($rpos + $offset, $min_rpos); +/** + * Helper function for text_summary. + * + * Recursively copies elements from $body to $summary, subtracting the length + * of the textContent portions from $size until $size reaches zero. + * + * @param $body + * The source DOMNode. + * @param $size + * The maximum number of textContent characters to copy. + * @param $summary + * The destination DOMNode. + * @param $doc + * The destination DOMDocument. Should be the same as the + * $summary->ownerDocument property. + * @param $parents + * An array of tag names of ancestor nodes. + * + * @return + * The number of additional characters left to copy. + */ +function _text_summarize($body, $size, $summary, $doc, $parents = array()) { + static $sentence_splitter; + static $word_splitter; + if (!isset($sentence_splitter)) { + // According to http://unicode.org/review/pr-23.html, these are the Unicode + // Sentence_Terminal characters. + $stops = + "\x21" . // 'Exclamation mark'. + "\x2E" . // 'Full stop'. + "\x3F" . // 'Question mark'. + "\xD6\x89" . // 'Armenian full stop'. + "\xD8\x9F" . // 'Arabic question mark'. + "\xDB\x94" . // 'Arabic full stop'. + "\xDC\x80" . // 'Syriac end of paragraph'. + "\xDC\x81" . // 'Syriac supralinear full stop'. + "\xDC\x82" . // 'Syriac sublinear full stop'. + "\xE0\xA5\xA4" . // 'Devanagari danda'. + "\xE1\x81\x8A" . // 'Myanmar sign little section'. + "\xE1\x81\x8B" . // 'Myanmar sign section'. + "\xE1\x8D\xA2" . // 'Ethiopic full stop'. + "\xE1\x8D\xA7" . // 'Ethiopic question mark'. + "\xE1\x8D\xA8" . // 'Ethiopic paragraph separator'. + "\xE1\x99\xAE" . // 'Canadian syllabics full stop'. + "\xE1\xA0\x83" . // 'Mongolian full stop'. + "\xE1\xA0\xA9" . // 'Mongolian manchu full stop'. + "\xE2\x80\xBC" . // 'Double exclamation mark'. + "\xE2\x80\xBD" . // 'Interrobang'. + "\xE2\x81\x87" . // 'Double question mark'. + "\xE2\x81\x88" . // 'Question exclamation mark'. + "\xE2\x81\x89" . // 'Exclamation question mark'. + "\xE3\x80\x82" . // 'Ideographic full stop'. + "\xEF\xB9\x92" . // 'Small full stop'. + "\xEF\xB9\x97" . // 'Small exclamation mark'. + "\xEF\xBC\x81" . // 'Fullwidth exclamation mark'. + "\xEF\xBC\x8E" . // 'Fullwidth full stop'. + "\xEF\xBC\x9E" . // 'Fullwidth question mark'. + "\xEF\xBD\xA1"; // 'Halfwidth ideographic full stop'. + // We split after Sentence_Terminal characters only if preceded by a Letter + // character and followed by a Separator character. + $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u'; + // If no suitable sentence break is found, we split before any Unicode + // Separator character. + $word_splitter = '/(?=\p{Z})/u'; + } + if ($body->nodeType === XML_TEXT_NODE) { + $text_length = drupal_strlen($body->textContent); + if ($text_length <= $size) { + $size -= $text_length; + $summary->appendChild($doc->createTextNode($body->textContent)); + return $size; + } + // We avoid breaking text nodes within code blocks. + if (in_array('code', $parents)) { + // Return zero to avoid adding subsequent text nodes. + return 0; + } + $sentences = preg_split($sentence_splitter, $body->textContent); + $text = ''; + foreach ($sentences as $sentence) { + $sentence_length = drupal_strlen($sentence); + // Only add the sentence if it fits within the length limit. + if ($sentence_length > $size) { + break; } + $text .= $sentence; + $size -= $sentence_length; } - - // If a break point was found in this group, slice and return the teaser. - if ($min_rpos !== $max_rpos) { - // Don't slice with length 0. Length must be <0 to slice from RHS. - return ($min_rpos === 0) ? $teaser : substr($teaser, 0, 0 - $min_rpos); + // If no suitable sentence break was found, try to break between words. + if ($text === '') { + $words = preg_split($word_splitter, $body->textContent); + foreach ($words as $word) { + $word_length = drupal_strlen($word); + // Only add the word if it fits within the length limit. + if ($word_length > $size) { + break; + } + $text .= $word; + $size -= $word_length; + } } + $summary->appendChild($doc->createTextNode($text)); + // Return zero to avoid adding subsequent text nodes. + return 0; } - - // If a break point was not found, still return a teaser. - return $teaser; + if ($body->hasChildNodes()) { + $node = $summary->appendChild($doc->createElement($body->tagName)); + $parents[] = $body->tagName; + foreach ($body->childNodes as $child) { + if ($size > 0) { + $size = _text_summarize($child, $size, $node, $doc, $parents); + } + else { + break; + } + } + } + return $size; } /** @@ -1269,10 +1362,10 @@ function node_search($op = 'search', $keys = NULL) { $join2 .= ' LEFT JOIN {node_counter} nc ON nc.nid = i.sid'; $total += $weight; } - - // When all search factors are disabled (ie they have a weight of zero), - // the default score is based only on keyword relevance and there is no need to - // adjust the score of each item. + + // When all search factors are disabled (ie they have a weight of zero), + // the default score is based only on keyword relevance and there is no need to + // adjust the score of each item. if ($total == 0) { $select2 = 'i.relevance AS score'; $total = 1; @@ -1280,7 +1373,7 @@ function node_search($op = 'search', $keys = NULL) { else { $select2 = implode(' + ', $ranking) . ' AS score'; } - + // Do search. $find = do_search($keys, 'node', 'INNER JOIN {node} n ON n.nid = i.sid '. $join1, $conditions1 . (empty($where1) ? '' : ' AND '. $where1), $arguments1, $select2, $join2, $arguments2); @@ -1708,7 +1801,7 @@ function node_feed($nids = FALSE, $channel = array()) { $item->body = $content; unset($item->teaser); } - + // Allow modules to modify the fully-built node. node_invoke_nodeapi($item, 'alter', $teaser, FALSE); } -- 1.7.5.4