From 6b1981e275412aea0d3deaec7418d04070c39ed7 Mon Sep 17 00:00:00 2001 From: Bob Vincent Date: Mon, 16 May 2011 10:05:03 -0400 Subject: [PATCH] Issue #221257 by pillarsdotnet: text_summary() should be HTML-aware. --- modules/field/modules/text/text.info | 1 + modules/field/modules/text/text.module | 217 ++++++++++++++++++++------------ modules/field/modules/text/text.test | 142 ++++++++------------- 3 files changed, 189 insertions(+), 171 deletions(-) diff --git a/modules/field/modules/text/text.info b/modules/field/modules/text/text.info index b424d2d452f05bb23498dac2e1504826c59c7ae6..e3f045eea3a698f22e0602eb837afe698c4d6c6f 100644 --- a/modules/field/modules/text/text.info +++ b/modules/field/modules/text/text.info @@ -4,5 +4,6 @@ package = Core version = VERSION core = 8.x dependencies[] = field +dependencies[] = filter files[] = text.test required = TRUE diff --git a/modules/field/modules/text/text.module b/modules/field/modules/text/text.module index 89c605cf2c046eb48d448e20d27a63ef88a8583f..cf3ba27cf755d715ba27a25b3c43a9a933f59783 100644 --- a/modules/field/modules/text/text.module +++ b/modules/field/modules/text/text.module @@ -330,49 +330,38 @@ function _text_sanitize($instance, $langcode, $item, $column) { * @param $text * The content for which a summary will be generated. * @param $format - * The format of the content. - * If the PHP filter is present and $text contains PHP code, we do not - * split it up to prevent parse errors. - * If the line break filter is present then we treat newlines embedded in - * $text as line breaks. - * If the htmlcorrector filter is present, it will be run on the generated - * summary (if different from the incoming $text). + * The format of the content. The $text string will be passed through + * check_markup() before generating a summary. * @param $size - * The desired character length of the summary. If omitted, the default - * value will be used. Ignored if the special delimiter is present - * in $text. + * The desired character length of the summary, not counting HTML tags. If + * omitted, the default value will be used. Ignored if the special delimiter + * is present in $text. * @return * The generated summary. */ function text_summary($text, $format = NULL, $size = NULL) { - - if (!isset($size)) { - // What used to be called 'teaser' is now called 'summary', but - // the variable 'teaser_length' is preserved for backwards compatibility. - $size = variable_get('teaser_length', 600); - } - // Find where the delimiter is in the body $delimiter = strpos($text, ''); - // If the size is zero, and there is no delimiter, the entire body is the summary. - if ($size == 0 && $delimiter === FALSE) { - return $text; - } - // If a valid delimiter has been specified, use it to chop off the summary. if ($delimiter !== FALSE) { - return substr($text, 0, $delimiter); + // Since there is no drupal_strpos(), we must use substr() instead of + // drupal_substr() here, or we'll break on UTF-8 input. + return trim(check_markup(substr($text, 0, $delimiter), $format)); } - // We check for the presence of the PHP evaluator filter in the current - // format. If the body contains PHP code, we do not split it up to prevent - // parse errors. - if (isset($format)) { - $filters = filter_list_format($format); - if (isset($filters['php_code']) && $filters['php_code']->status && strpos($text, 'documentElement; - // The summary may not be longer than maximum length specified. Initial slice. - $summary = truncate_utf8($text, $size); + // Generate a DOM Document to hold the summary. + $summary_doc = new DOMDocument(); - // Store the actual length of the UTF8 string -- which might not be the same - // as $size. - $max_rpos = strlen($summary); + // Recursively copy each child node from $body_node to $summary_doc + // until $size limit is reached. + _text_summarize($body_node, $size, $summary_doc, $summary_doc); - // How much to cut off the end of the summary so that it doesn't end in the - // middle of a paragraph, sentence, or word. - // Initialize it to maximum in order to find the minimum. - $min_rpos = $max_rpos; + // Convert the summary document back to XHTML. + $output = filter_dom_serialize($summary_doc); - // Store the reverse of the summary. We use strpos on the reversed needle and - // haystack for speed and convenience. - $reversed = strrev($summary); - - // Build an array of arrays of break points grouped by preference. - $break_points = array(); - - // A paragraph near the end of sliced summary is most preferable. - $break_points[] = array('

' => 0); - - // If no complete paragraph then treat line breaks as paragraphs. - $line_breaks = array('
' => 6, '
' => 4); - // Newline only indicates a line break if line break converter - // filter is present. - if (isset($filters['filter_autop'])) { - $line_breaks["\n"] = 1; + // DOM automatically wraps plain-text in a

tag, but if the original + // formatted version was plaintext, then the summary should be plaintext + // also. + if ($text === strip_tags($text)) { + $output = strip_tags($output); } - $break_points[] = $line_breaks; - - // If the first paragraph is too long, split at the end of a sentence. - $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1); + return trim($output); +} - // Iterate over the groups of break points until a break point is found. - foreach ($break_points as $points) { - // Look for each break point, starting at the end of the summary. - foreach ($points as $point => $offset) { - // The summary is already reversed, but the break point isn't. - $rpos = strpos($reversed, strrev($point)); - if ($rpos !== FALSE) { - $min_rpos = min($rpos + $offset, $min_rpos); - } +/** + * Helper function for text_summary. + * + * Recursively copies elements from $body to $summary, subtracting the length + * of the textContent portions from $size until $size reaches zero. + * + * @param $body + * The source DOMNode. + * @param $size + * The maximum number of textContent characters to copy. + * @param $summary + * The destination DOMNode. + * @param $doc + * The destination DOMDocument. Should be the same as the + * $summary->ownerDocument property. + * @param $parents + * An array of tag names of ancestor nodes. + * + * @return + * The number of additional characters left to copy. + */ +function _text_summarize($body, $size, $summary, $doc, $parents = array()) { + static $sentence_splitter; + if (!isset($sentence_splitter)) { + // According to http://unicode.org/review/pr-23.html, + // these are the Unicode Sentence_Terminal characters. + $stops = + '!' . // Exclamation mark. + '.' . // Full stop. + '?' . // Question mark. + '?' . // Armenian full stop. + '?' . // Arabic question mark. + '?' . // Arabic full stop. + '?' . // Syriac end of paragraph. + '?' . // Syriac supralinear full stop. + '?' . // Syriac sublinear full stop. + '?' . // Devanagari danda. + '?' . // Myanmar sign little section. + '?' . // Myanmar sign section. + '?' . // Ethiopic full stop. + '?' . // Ethiopic question mark. + '?' . // Ethiopic paragraph separator. + '?' . // Canadian syllabics full stop. + '?' . // Mongolian full stop. + '?' . // Mongolian manchu full stop. + '?' . // Double exclamation mark. + '?' . // Interrobang. + '?' . // Double question mark. + '?' . // Question exclamation mark. + '?' . // Exclamation question mark. + '?' . // Ideographic full stop. + '?' . // Small full stop. + '?' . // Small exclamation mark. + '?' . // Fullwidth exclamation mark. + '?' . // Fullwidth full stop. + '?' . // Fullwidth question mark. + '?'; // Halfwidth ideographic full stop. + // We split after Sentence_Terminal characters only if preceded by a Letter + // character and followed by a Separator character. + $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u'; + } + if ($body->nodeType === XML_TEXT_NODE) { + $text_length = drupal_strlen($body->textContent); + if ($text_length <= $size) { + $size -= $text_length; + $summary->appendChild($doc->createTextNode($body->textContent)); + return $size; } - - // If a break point was found in this group, slice and stop searching. - if ($min_rpos !== $max_rpos) { - // Don't slice with length 0. Length must be <0 to slice from RHS. - $summary = ($min_rpos === 0) ? $summary : substr($summary, 0, 0 - $min_rpos); - break; + // We avoid breaking text nodes within CODE blocks. + if (in_array('CODE', $parents)) { + // Return zero to avoid adding subsequent text nodes. + return 0; } + $sentences = preg_split($sentence_splitter, $body->textContent); + $text = ''; + foreach ($sentences as $sentence) { + $sentence_length = drupal_strlen($sentence); + // Only add the sentence if it fits within the length limit. + if ($sentence_length > $size) { + break; + } + $text .= $sentence; + $size -= $sentence_length; + } + $summary->appendChild($doc->createTextNode($text)); + // Return zero to avoid adding subsequent text nodes. + return 0; } - - // If the htmlcorrector filter is present, apply it to the generated summary. - if (isset($filters['filter_htmlcorrector'])) { - $summary = _filter_htmlcorrector($summary); + if ($body->hasChildNodes()) { + $node = $summary->appendChild($doc->createElement($body->tagName)); + $parents[] = drupal_strtoupper($body->tagName); + foreach ($body->childNodes as $child) { + if ($size > 0) { + $size = _text_summarize($child, $size, $node, $doc); + } + else { + break; + } + } } - - return $summary; + return $size; } /** diff --git a/modules/field/modules/text/text.test b/modules/field/modules/text/text.test index b42fed7e09894d352b3ecdd27431234e6ae77f72..ff3ff044c30e12bb44073e80bde98a33ef8b2be9 100644 --- a/modules/field/modules/text/text.test +++ b/modules/field/modules/text/text.test @@ -258,7 +258,8 @@ class TextSummaryTestCase extends DrupalWebTestCase { */ function testFirstSentenceQuestion() { $text = 'A question? A sentence. Another sentence.'; - $expected = 'A question? A sentence.'; + // The default format includes the auto-paragraph filter. + $expected = '

A question? A sentence.

'; $this->callTextSummary($text, $expected, NULL, 30); } @@ -270,9 +271,9 @@ class TextSummaryTestCase extends DrupalWebTestCase { 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . // 108 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. ' . // 103 'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'; // 110 - $expected = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' . + $expected = '

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' . 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . - 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.'; + 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.

'; // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word. $this->callTextSummary($text, $expected, NULL, 340); } @@ -286,95 +287,52 @@ class TextSummaryTestCase extends DrupalWebTestCase { // The summaries we expect text_summary() to return when $size is the index // of each array item. - // Using no text format: - $expected = array( - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "<", - "", - "

\n", - "

\nH", - "

\nHi", - "

\nHi\n", - "

\nHi\n<", - "

\nHi\n\nHi\n\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - ); - // And using a text format WITH the line-break and htmlcorrector filters. - $expected_lb = array( - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "", - "

", - "

", - "

", - "

", - "

", - "

\nHi

", - "

\nHi

", - "

\nHi

", - "

\nHi

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", - "

\nHi\n

\n

\nfolks\n
\n!\n

", + // Using filtered_html format: + $expected = array ( + 0 => "

Hi

\n

folks

\n

!

", + 1 => "

", + 2 => "

Hi

", + 3 => "

Hi

", + 4 => "

Hi

\n

", + 5 => "

Hi

\n

", + 6 => "

Hi

\n

", + 7 => "

Hi

\n

", + 8 => "

Hi

\n

folks

", + 9 => "

Hi

\n

folks

", + 10 => "

Hi

\n

folks

\n

!

", + 11 => "

Hi

\n

folks

\n

!

", + 12 => "

Hi

\n

folks

\n

!

", + 13 => "

Hi

\n

folks

\n

!

", + 14 => "

Hi

\n

folks

\n

!

", + 15 => "

Hi

\n

folks

\n

!

", + 16 => "

Hi

\n

folks

\n

!

", + 17 => "

Hi

\n

folks

\n

!

", + 18 => "

Hi

\n

folks

\n

!

", + 19 => "

Hi

\n

folks

\n

!

", + 20 => "

Hi

\n

folks

\n

!

", + 21 => "

Hi

\n

folks

\n

!

", + 22 => "

Hi

\n

folks

\n

!

", + 23 => "

Hi

\n

folks

\n

!

", + 24 => "

Hi

\n

folks

\n

!

", + 25 => "

Hi

\n

folks

\n

!

", + 26 => "

Hi

\n

folks

\n

!

", + 27 => "

Hi

\n

folks

\n

!

", + 28 => "

Hi

\n

folks

\n

!

", + 29 => "

Hi

\n

folks

\n

!

", + 30 => "

Hi

\n

folks

\n

!

", + 31 => "

Hi

\n

folks

\n

!

", + 32 => "

Hi

\n

folks

\n

!

", + 33 => "

Hi

\n

folks

\n

!

", + 34 => "

Hi

\n

folks

\n

!

", + 35 => "

Hi

\n

folks

\n

!

", + 36 => "

Hi

\n

folks

\n

!

", + 37 => "

Hi

\n

folks

\n

!

", ); // Test text_summary() for different sizes. for ($i = 0; $i <= 37; $i++) { - $this->callTextSummary($text, $expected[$i], NULL, $i); - $this->callTextSummary($text, $expected_lb[$i], 'plain_text', $i); - $this->callTextSummary($text, $expected_lb[$i], 'filtered_html', $i); + $this->callTextSummary($text, $expected[$i], 'filtered_html', $i); } } @@ -383,7 +341,15 @@ class TextSummaryTestCase extends DrupalWebTestCase { */ function callTextSummary($text, $expected, $format = NULL, $size = NULL) { $summary = text_summary($text, $format, $size); - $this->assertIdentical($summary, $expected, t('Generated summary "@summary" matches expected "@expected".', array('@summary' => $summary, '@expected' => $expected))); + $replacements = array( + '@summary' => '"' . str_replace("\n", '\n', $summary) . '"', + '@expected' => '"' . str_replace("\n", '\n', $expected) . '"', + ); + $comment = t( + 'Generated summary @summary matches expected @expected.', + $replacements + ); + $this->assertIdentical($summary, $expected, $comment); } /** -- 1.7.4.1