From 52e4140d6d19eb464d092d24bb8bf08cd997ae42 Mon Sep 17 00:00:00 2001 From: Bob Vincent Date: Tue, 6 Mar 2012 04:07:45 -0500 Subject: [PATCH] Issue #221257 by gpk, AlexisWilke, NancyDru, sun, pillarsdotnet, David_Rothstein, effulgentsia: text_summary() should output valid HTML and Unicode text. --- core/modules/field/modules/text/text.info | 1 + core/modules/field/modules/text/text.module | 252 ++++++++++++++++++--------- core/modules/field/modules/text/text.test | 174 +++++++++---------- 3 files changed, 255 insertions(+), 172 deletions(-) diff --git a/core/modules/field/modules/text/text.info b/core/modules/field/modules/text/text.info index b424d2d452f05bb23498dac2e1504826c59c7ae6..e3f045eea3a698f22e0602eb837afe698c4d6c6f 100644 --- a/core/modules/field/modules/text/text.info +++ b/core/modules/field/modules/text/text.info @@ -4,5 +4,6 @@ package = Core version = VERSION core = 8.x dependencies[] = field +dependencies[] = filter files[] = text.test required = TRUE diff --git a/core/modules/field/modules/text/text.module b/core/modules/field/modules/text/text.module index d73814faaafa007f2f13499616ecd17edcf24b7f..6ac43d3f7757d71767cf0358561bbb33d79eec6c 100644 --- a/core/modules/field/modules/text/text.module +++ b/core/modules/field/modules/text/text.module @@ -324,124 +324,216 @@ function _text_sanitize($instance, $langcode, $item, $column) { * * If the end of the summary is not indicated using the delimiter * then we generate the summary automatically, trying to end it at a sensible - * place such as the end of a paragraph, a line break, or the end of a - * sentence (in that order of preference). + * place such as the end of a paragraph, a line break, a sentence, or at a + * whitespace character (in that order of preference). * * @param $text * The content for which a summary will be generated. * @param $format - * The format of the content. - * If the PHP filter is present and $text contains PHP code, we do not - * split it up to prevent parse errors. 
- * If the line break filter is present then we treat newlines embedded in - * $text as line breaks. - * If the htmlcorrector filter is present, it will be run on the generated - * summary (if different from the incoming $text). + * The format of the content. This parameter is actually unnecessary since the + * format will have already been applied by _text_sanitize(). * @param $size - * The desired character length of the summary. If omitted, the default - * value will be used. Ignored if the special delimiter is present - * in $text. + * The desired character length of the summary, not counting HTML tags. If + * omitted, the default value will be used. Ignored if the special delimiter + * is present in $text. * @return * The generated summary. */ function text_summary($text, $format = NULL, $size = NULL) { - if (!isset($size)) { // What used to be called 'teaser' is now called 'summary', but // the variable 'teaser_length' is preserved for backwards compatibility. $size = variable_get('teaser_length', 600); } - // Find where the delimiter is in the body + // Find where the delimiter is in the body. Note the use of strpos() to count + // bytes rather than UTF-8 character sequences. $delimiter = strpos($text, ''); - // If the size is zero, and there is no delimiter, the entire body is the summary. - if ($size == 0 && $delimiter === FALSE) { + // If there is no delimiter and the size is either 0 or larger than the text + // length, then return the full text. + if ($delimiter === FALSE && ($size == 0 || drupal_strlen($text) <= $size)) { return $text; } + // DOM manipulations needed to ensure valid HTML after trimming text can be + // slow, so check the cache first. This function does not apply a filter + // format, but it's related enough to the filter system to use its cache bin. + $cache_id = "text_summary:$size:" . 
drupal_hash_base64($text); + if ($cached = cache_get($cache_id, 'cache_filter')) { + return $cached->data; + } + // If a valid delimiter has been specified, use it to chop off the summary. if ($delimiter !== FALSE) { - return substr($text, 0, $delimiter); - } + // Since there is no drupal_strpos(), we must use substr() instead of + // drupal_substr() here, or we'll break on UTF-8 input. + $output = substr($text, 0, $delimiter); - // We check for the presence of the PHP evaluator filter in the current - // format. If the body contains PHP code, we do not split it up to prevent - // parse errors. - if (isset($format)) { - $filters = filter_list_format($format); - if (isset($filters['php_code']) && $filters['php_code']->status && strpos($text, 'documentElement; - // If we have a short body, the entire body is the summary. - if (drupal_strlen($text) <= $size) { - return $text; - } - - // If the delimiter has not been specified, try to split at paragraph or - // sentence boundaries. - - // The summary may not be longer than maximum length specified. Initial slice. - $summary = truncate_utf8($text, $size); - - // Store the actual length of the UTF8 string -- which might not be the same - // as $size. - $max_rpos = strlen($summary); + // Generate a DOM Document to hold the summary. + $summary_doc = new DOMDocument(); - // How much to cut off the end of the summary so that it doesn't end in the - // middle of a paragraph, sentence, or word. - // Initialize it to maximum in order to find the minimum. - $min_rpos = $max_rpos; + // Recursively copy each child node from $body_node to $summary_doc + // until $size limit is reached. + _text_summarize($body_node, $size, $summary_doc, $summary_doc); - // Store the reverse of the summary. We use strpos on the reversed needle and - // haystack for speed and convenience. - $reversed = strrev($summary); - - // Build an array of arrays of break points grouped by preference. 
- $break_points = array(); - - // A paragraph near the end of sliced summary is most preferable. - $break_points[] = array('

' => 0); + // Convert the summary document back to XHTML. + $output = filter_dom_serialize($summary_doc); + } + // DOM automatically wraps plain-text in a

tag, but if the input was plain + // text, then the summary should be as well. - // If no complete paragraph then treat line breaks as paragraphs. - $line_breaks = array('
' => 6, '
' => 4); - // Newline only indicates a line break if line break converter - // filter is present. - if (isset($filters['filter_autop'])) { - $line_breaks["\n"] = 1; + if ($text === strip_tags($text)) { + $output = strip_tags($output); } - $break_points[] = $line_breaks; + $output = trim($output); - // If the first paragraph is too long, split at the end of a sentence. - $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1); + // Cache the result for later use. + cache_set($cache_id, $output, 'cache_filter'); + return $output; +} - // Iterate over the groups of break points until a break point is found. - foreach ($break_points as $points) { - // Look for each break point, starting at the end of the summary. - foreach ($points as $point => $offset) { - // The summary is already reversed, but the break point isn't. - $rpos = strpos($reversed, strrev($point)); - if ($rpos !== FALSE) { - $min_rpos = min($rpos + $offset, $min_rpos); +/** + * Helper function for text_summary. + * + * Recursively copies elements from $body to $summary, subtracting the length + * of the textContent portions from $size until $size reaches zero. + * + * @param $body + * The source DOMNode. + * @param $size + * The maximum number of textContent characters to copy. + * @param $summary + * The destination DOMNode. + * @param $doc + * The destination DOMDocument. Should be the same as the + * $summary->ownerDocument property. + * @param $parents + * An array of tag names of ancestor nodes. + * + * @return + * The number of additional characters left to copy. + */ +function _text_summarize($body, $size, $summary, $doc, $parents = array()) { + static $sentence_splitter; + static $word_splitter; + if (!isset($sentence_splitter)) { + // According to http://unicode.org/review/pr-23.html, these are the Unicode + // Sentence_Terminal characters. + $stops = + "\x21" . // 'Exclamation mark'. + "\x2E" . // 'Full stop'. + "\x3F" . // 'Question mark'. + "\xD6\x89" . 
// 'Armenian full stop'. + "\xD8\x9F" . // 'Arabic question mark'. + "\xDB\x94" . // 'Arabic full stop'. + "\xDC\x80" . // 'Syriac end of paragraph'. + "\xDC\x81" . // 'Syriac supralinear full stop'. + "\xDC\x82" . // 'Syriac sublinear full stop'. + "\xE0\xA5\xA4" . // 'Devanagari danda'. + "\xE1\x81\x8A" . // 'Myanmar sign little section'. + "\xE1\x81\x8B" . // 'Myanmar sign section'. + "\xE1\x8D\xA2" . // 'Ethiopic full stop'. + "\xE1\x8D\xA7" . // 'Ethiopic question mark'. + "\xE1\x8D\xA8" . // 'Ethiopic paragraph separator'. + "\xE1\x99\xAE" . // 'Canadian syllabics full stop'. + "\xE1\xA0\x83" . // 'Mongolian full stop'. + "\xE1\xA0\x89" . // 'Mongolian manchu full stop' (U+1809). + "\xE2\x80\xBC" . // 'Double exclamation mark'. + "\xE2\x80\xBD" . // 'Interrobang'. + "\xE2\x81\x87" . // 'Double question mark'. + "\xE2\x81\x88" . // 'Question exclamation mark'. + "\xE2\x81\x89" . // 'Exclamation question mark'. + "\xE3\x80\x82" . // 'Ideographic full stop'. + "\xEF\xB9\x92" . // 'Small full stop'. + "\xEF\xB9\x97" . // 'Small exclamation mark'. + "\xEF\xBC\x81" . // 'Fullwidth exclamation mark'. + "\xEF\xBC\x8E" . // 'Fullwidth full stop'. + "\xEF\xBC\x9F" . // 'Fullwidth question mark' (U+FF1F). + "\xEF\xBD\xA1"; // 'Halfwidth ideographic full stop'. + // We split after Sentence_Terminal characters only if preceded by a Letter + // character and followed by a Separator character. + $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u'; + // If no suitable sentence break is found, we split before any Unicode + // separator character. + $word_splitter = '/(?=\p{Z})/u'; + } + if ($body->nodeType === XML_TEXT_NODE) { + $text_length = drupal_strlen($body->textContent); + if ($text_length <= $size) { + $size -= $text_length; + $summary->appendChild($doc->createTextNode($body->textContent)); + return $size; + } + // We avoid breaking text nodes within code blocks. 
+ if (in_array('code', $parents)) { + return 0; + } + $sentences = preg_split($sentence_splitter, $body->textContent); + $text = ''; + foreach ($sentences as $sentence) { + $sentence_length = drupal_strlen($sentence); + // Only add the sentence if it fits within the length limit. + if ($sentence_length > $size) { + break; } + $text .= $sentence; + $size -= $sentence_length; } - - // If a break point was found in this group, slice and stop searching. - if ($min_rpos !== $max_rpos) { - // Don't slice with length 0. Length must be <0 to slice from RHS. - $summary = ($min_rpos === 0) ? $summary : substr($summary, 0, 0 - $min_rpos); - break; + // If no suitable sentence break was found, try to break between words. + if ($text === '') { + $words = preg_split($word_splitter, $body->textContent); + foreach ($words as $word) { + $word_length = drupal_strlen($word); + // Only add the word if it fits within the length limit. + if ($word_length > $size) { + break; + } + $text .= $word; + $size -= $word_length; + } } + // Append the complete sentences or words, and return 0 to indicate + // completion. + $summary->appendChild($doc->createTextNode($text)); + return 0; } - // If the htmlcorrector filter is present, apply it to the generated summary. - if (isset($filters['filter_htmlcorrector'])) { - $summary = _filter_htmlcorrector($summary); + // Recurse the DOM until the summary reaches the text length limit. 
+ if ($body->nodeType === XML_ELEMENT_NODE) { + $node = $summary->appendChild($doc->createElement($body->tagName)); + $parents[] = $body->tagName; + if ($body->hasAttributes()) { + foreach ($body->attributes as $attributeNode) { + $node->setAttribute($attributeNode->nodeName, $attributeNode->value); + } + } + if ($body->hasChildNodes()) { + foreach ($body->childNodes as $child) { + if ($size > 0) { + $size = _text_summarize($child, $size, $node, $doc, $parents); + } + else { + break; + } + } + } } - - return $summary; + return $size; } /** diff --git a/core/modules/field/modules/text/text.test b/core/modules/field/modules/text/text.test index 2d936be773ffbc2621f9d9405454dd41294d7cc7..c17f163e64f8a1a8ddfebdbad9b1f53a1ecb81b6 100644 --- a/core/modules/field/modules/text/text.test +++ b/core/modules/field/modules/text/text.test @@ -258,7 +258,8 @@ class TextSummaryTestCase extends DrupalWebTestCase { */ function testFirstSentenceQuestion() { $text = 'A question? A sentence. Another sentence.'; - $expected = 'A question? A sentence.'; + // The default format includes the auto-paragraph filter. + $expected = '

A question? A sentence.

'; $this->callTextSummary($text, $expected, NULL, 30); } @@ -270,11 +271,53 @@ class TextSummaryTestCase extends DrupalWebTestCase { 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . // 108 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. ' . // 103 'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'; // 110 - $expected = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' . + $expected = '

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' . 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . - 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.'; - // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word. - $this->callTextSummary($text, $expected, NULL, 340); + 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.

'; + // Test that sentence splitting works when we replace the full stops with + // any other Unicode Sentence_Terminal character listed at + // http://unicode.org/review/pr-23.html. + $stops = array( + "\x21", // 'Exclamation mark'. + "\x2E", // 'Full stop'. + "\x3F", // 'Question mark'. + "\xD6\x89", // 'Armenian full stop'. + "\xD8\x9F", // 'Arabic question mark'. + "\xDB\x94", // 'Arabic full stop'. + "\xDC\x80", // 'Syriac end of paragraph'. + "\xDC\x81", // 'Syriac supralinear full stop'. + "\xDC\x82", // 'Syriac sublinear full stop'. + "\xE0\xA5\xA4", // 'Devanagari danda'. + "\xE1\x81\x8A", // 'Myanmar sign little section'. + "\xE1\x81\x8B", // 'Myanmar sign section'. + "\xE1\x8D\xA2", // 'Ethiopic full stop'. + "\xE1\x8D\xA7", // 'Ethiopic question mark'. + "\xE1\x8D\xA8", // 'Ethiopic paragraph separator'. + "\xE1\x99\xAE", // 'Canadian syllabics full stop'. + "\xE1\xA0\x83", // 'Mongolian full stop'. + "\xE1\xA0\x89", // 'Mongolian manchu full stop' (U+1809). + "\xE2\x80\xBC", // 'Double exclamation mark'. + "\xE2\x80\xBD", // 'Interrobang'. + "\xE2\x81\x87", // 'Double question mark'. + "\xE2\x81\x88", // 'Question exclamation mark'. + "\xE2\x81\x89", // 'Exclamation question mark'. + "\xE3\x80\x82", // 'Ideographic full stop'. + "\xEF\xB9\x92", // 'Small full stop'. + "\xEF\xB9\x97", // 'Small exclamation mark'. + "\xEF\xBC\x81", // 'Fullwidth exclamation mark'. + "\xEF\xBC\x8E", // 'Fullwidth full stop'. + "\xEF\xBC\x9F", // 'Fullwidth question mark' (U+FF1F). + "\xEF\xBD\xA1", // 'Halfwidth ideographic full stop'. + ); + foreach ($stops as $stop) { + // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word. + $this->callTextSummary( + str_replace('.', $stop, $text), + str_replace('.', $stop, $expected), + NULL, + 340 + ); + } } /** @@ -286,104 +329,51 @@ class TextSummaryTestCase extends DrupalWebTestCase { // The summaries we expect text_summary() to return when $size is the index // of each array item. 
- // Using no text format: - $expected = array( - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "<", - "", - "

\n", - "

\nH", - "

\nHi", - "

\nHi\n", - "

\nHi\n<", - "

\nHi\n\nHi\n\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - ); - - // And using a text format WITH the line-break and htmlcorrector filters. - $expected_lb = array( - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "", - "

", - "

", - "

", - "

", - "

", - "

\nHi

", - "

\nHi

", - "

\nHi

", - "

\nHi

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", + // Using filtered_html format: + $expected = array ( + 0 => "

Hi

\n

folks

\n

!

", + 1 => "

", + 2 => "

Hi

", + 3 => "

Hi

", + 4 => "

Hi

\n

", + 5 => "

Hi

\n

", + 6 => "

Hi

\n

", + 7 => "

Hi

\n

", + 8 => "

Hi

\n

folks

", + 9 => "

Hi

\n

folks

", + 10 => "

Hi

\n

folks

\n

!

", ); // Test text_summary() for different sizes. - for ($i = 0; $i <= 37; $i++) { - $this->callTextSummary($text, $expected[$i], NULL, $i); - $this->callTextSummary($text, $expected_lb[$i], 'plain_text', $i); - $this->callTextSummary($text, $expected_lb[$i], 'filtered_html', $i); + for ($i = 0; $i <= 10; $i++) { + $this->callTextSummary($text, $expected[$i], 'filtered_html', $i); } } /** + * Test that we avoid breaking text in the middle of a CODE block. + */ + function testCode() { + $text = 'This is an example code block:' + . '$example = "Sentence one. Sentence two. Sentence three.'; + $expected = '

This is an example code block:

'; + $this->callTextSummary($text, $expected, 'full_html', 65); + } + + /** * Calls text_summary() and asserts that the expected teaser is returned. */ function callTextSummary($text, $expected, $format = NULL, $size = NULL) { $summary = text_summary($text, $format, $size); - $this->assertIdentical($summary, $expected, t('Generated summary "@summary" matches expected "@expected".', array('@summary' => $summary, '@expected' => $expected))); + $replacements = array( + '@summary' => '"' . str_replace("\n", '\n', $summary) . '"', + '@expected' => '"' . str_replace("\n", '\n', $expected) . '"', + ); + $comment = t( + 'Generated summary @summary matches expected @expected.', + $replacements + ); + $this->assertIdentical($summary, $expected, $comment); } /** -- 1.7.5.4