From 52e4140d6d19eb464d092d24bb8bf08cd997ae42 Mon Sep 17 00:00:00 2001
From: Bob Vincent
tag, but if the input was plain
+ // text, then the summary should be as well.
- // If no complete paragraph then treat line breaks as paragraphs.
- $line_breaks = array('
' => 6, '
' => 4);
- // Newline only indicates a line break if line break converter
- // filter is present.
- if (isset($filters['filter_autop'])) {
- $line_breaks["\n"] = 1;
+ if ($text === strip_tags($text)) {
+ $output = strip_tags($output);
}
- $break_points[] = $line_breaks;
+ $output = trim($output);
- // If the first paragraph is too long, split at the end of a sentence.
- $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1);
+ // Cache the result for later use.
+ cache_set($cache_id, $output, 'cache_filter');
+ return $output;
+}
- // Iterate over the groups of break points until a break point is found.
- foreach ($break_points as $points) {
- // Look for each break point, starting at the end of the summary.
- foreach ($points as $point => $offset) {
- // The summary is already reversed, but the break point isn't.
- $rpos = strpos($reversed, strrev($point));
- if ($rpos !== FALSE) {
- $min_rpos = min($rpos + $offset, $min_rpos);
+/**
+ * Helper function for text_summary.
+ *
+ * Recursively copies elements from $body to $summary, subtracting the length
+ * of the textContent portions from $size until $size reaches zero.
+ *
+ * A text node that does not fit in the remaining $size is cut at the last
+ * sentence boundary that fits or, if not even one sentence fits, at the last
+ * word boundary that fits. A text node that does not fit and sits inside a
+ * CODE element is dropped entirely instead of being split.
+ *
+ * @param $body
+ * The source DOMNode.
+ * @param $size
+ * The maximum number of textContent characters to copy.
+ * @param $summary
+ * The destination DOMNode.
+ * @param $doc
+ * The destination DOMDocument. Should be the same as the
+ * $summary->ownerDocument property.
+ * @param $parents
+ * An array of tag names of ancestor nodes.
+ *
+ * @return
+ * The number of additional characters left to copy. This is 0 as soon as a
+ * text node had to be shortened or dropped, which stops the recursion.
+ */
+function _text_summarize($body, $size, $summary, $doc, $parents = array()) {
+ static $sentence_splitter;
+ static $word_splitter;
+ if (!isset($sentence_splitter)) {
+ // According to http://unicode.org/review/pr-23.html, these are the Unicode
+ // Sentence_Terminal characters.
+ // Each entry is the UTF-8 byte sequence of the code point named in its
+ // comment; any list that mirrors this one must use the same sequences.
+ $stops =
+ "\x21" . // 'Exclamation mark'.
+ "\x2E" . // 'Full stop'.
+ "\x3F" . // 'Question mark'.
+ "\xD6\x89" . // 'Armenian full stop'.
+ "\xD8\x9F" . // 'Arabic question mark'.
+ "\xDB\x94" . // 'Arabic full stop'.
+ "\xDC\x80" . // 'Syriac end of paragraph'.
+ "\xDC\x81" . // 'Syriac supralinear full stop'.
+ "\xDC\x82" . // 'Syriac sublinear full stop'.
+ "\xE0\xA5\xA4" . // 'Devanagari danda'.
+ "\xE1\x81\x8A" . // 'Myanmar sign little section'.
+ "\xE1\x81\x8B" . // 'Myanmar sign section'.
+ "\xE1\x8D\xA2" . // 'Ethiopic full stop'.
+ "\xE1\x8D\xA7" . // 'Ethiopic question mark'.
+ "\xE1\x8D\xA8" . // 'Ethiopic paragraph separator'.
+ "\xE1\x99\xAE" . // 'Canadian syllabics full stop'.
+ "\xE1\xA0\x83" . // 'Mongolian full stop'.
+ "\xE1\xA0\x89" . // 'Mongolian manchu full stop'.
+ "\xE2\x80\xBC" . // 'Double exclamation mark'.
+ "\xE2\x80\xBD" . // 'Interrobang'.
+ "\xE2\x81\x87" . // 'Double question mark'.
+ "\xE2\x81\x88" . // 'Question exclamation mark'.
+ "\xE2\x81\x89" . // 'Exclamation question mark'.
+ "\xE3\x80\x82" . // 'Ideographic full stop'.
+ "\xEF\xB9\x92" . // 'Small full stop'.
+ "\xEF\xB9\x97" . // 'Small exclamation mark'.
+ "\xEF\xBC\x81" . // 'Fullwidth exclamation mark'.
+ "\xEF\xBC\x8E" . // 'Fullwidth full stop'.
+ "\xEF\xBC\x9F" . // 'Fullwidth question mark'.
+ "\xEF\xBD\xA1"; // 'Halfwidth ideographic full stop'.
+ // We split after Sentence_Terminal characters only if preceded by a Letter
+ // character and followed by a Separator character.
+ $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u';
+ // If no suitable sentence break is found, we split before any Unicode
+ // separator character.
+ $word_splitter = '/(?=\p{Z})/u';
+ }
+ if ($body->nodeType === XML_TEXT_NODE) {
+ $text_length = drupal_strlen($body->textContent);
+ if ($text_length <= $size) {
+ $size -= $text_length;
+ $summary->appendChild($doc->createTextNode($body->textContent));
+ return $size;
+ }
+ // We avoid breaking text nodes within code blocks.
+ if (in_array('code', $parents)) {
+ return 0;
+ }
+ $sentences = preg_split($sentence_splitter, $body->textContent);
+ $text = '';
+ foreach ($sentences as $sentence) {
+ $sentence_length = drupal_strlen($sentence);
+ // Only add the sentence if it fits within the length limit.
+ if ($sentence_length > $size) {
+ break;
}
+ $text .= $sentence;
+ $size -= $sentence_length;
}
-
- // If a break point was found in this group, slice and stop searching.
- if ($min_rpos !== $max_rpos) {
- // Don't slice with length 0. Length must be <0 to slice from RHS.
- $summary = ($min_rpos === 0) ? $summary : substr($summary, 0, 0 - $min_rpos);
- break;
+ // If no suitable sentence break was found, try to break between words.
+ if ($text === '') {
+ $words = preg_split($word_splitter, $body->textContent);
+ foreach ($words as $word) {
+ $word_length = drupal_strlen($word);
+ // Only add the word if it fits within the length limit.
+ if ($word_length > $size) {
+ break;
+ }
+ $text .= $word;
+ $size -= $word_length;
+ }
}
+ // Append the complete sentences or words, and return 0 to indicate
+ // completion.
+ $summary->appendChild($doc->createTextNode($text));
+ return 0;
}
- // If the htmlcorrector filter is present, apply it to the generated summary.
- if (isset($filters['filter_htmlcorrector'])) {
- $summary = _filter_htmlcorrector($summary);
+ // Recurse the DOM until the summary reaches the text length limit.
+ if ($body->nodeType === XML_ELEMENT_NODE) {
+ $node = $summary->appendChild($doc->createElement($body->tagName));
+ $parents[] = $body->tagName;
+ if ($body->hasAttributes()) {
+ foreach ($body->attributes as $attributeNode) {
+ $node->setAttribute($attributeNode->nodeName, $attributeNode->value);
+ }
+ }
+ if ($body->hasChildNodes()) {
+ foreach ($body->childNodes as $child) {
+ if ($size > 0) {
+ $size = _text_summarize($child, $size, $node, $doc, $parents);
+ }
+ else {
+ break;
+ }
+ }
+ }
}
-
- return $summary;
+ return $size;
}
/**
diff --git a/core/modules/field/modules/text/text.test b/core/modules/field/modules/text/text.test
index 2d936be773ffbc2621f9d9405454dd41294d7cc7..c17f163e64f8a1a8ddfebdbad9b1f53a1ecb81b6 100644
--- a/core/modules/field/modules/text/text.test
+++ b/core/modules/field/modules/text/text.test
@@ -258,7 +258,8 @@ class TextSummaryTestCase extends DrupalWebTestCase {
*/
function testFirstSentenceQuestion() {
$text = 'A question? A sentence. Another sentence.';
- $expected = 'A question? A sentence.';
+ // The default format includes the auto-paragraph filter.
+ $expected = '
A question? A sentence.
'; $this->callTextSummary($text, $expected, NULL, 30); } @@ -270,11 +271,53 @@ class TextSummaryTestCase extends DrupalWebTestCase { 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . // 108 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. ' . // 103 'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'; // 110 - $expected = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' . + $expected = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ' . 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ' . - 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.'; - // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word. - $this->callTextSummary($text, $expected, NULL, 340); + 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
'; + // Test that sentence splitting works when we replace the full stops with + // any other Unicode Sentence_Terminal character listed at + // http://unicode.org/review/pr-23.html. + $stops = array( + "\x21", // 'Exclamation mark'. + "\x2E", // 'Full stop'. + "\x3F", // 'Question mark'. + "\xD6\x89", // 'Armenian full stop'. + "\xD8\x9F", // 'Arabic question mark'. + "\xDB\x94", // 'Arabic full stop'. + "\xDC\x80", // 'Syriac end of paragraph'. + "\xDC\x81", // 'Syriac supralinear full stop'. + "\xDC\x82", // 'Syriac sublinear full stop'. + "\xE0\xA5\xA4", // 'Devanagari danda'. + "\xE1\x81\x8A", // 'Myanmar sign little section'. + "\xE1\x81\x8B", // 'Myanmar sign section'. + "\xE1\x8D\xA2", // 'Ethiopic full stop'. + "\xE1\x8D\xA7", // 'Ethiopic question mark'. + "\xE1\x8D\xA8", // 'Ethiopic paragraph separator'. + "\xE1\x99\xAE", // 'Canadian syllabics full stop'. + "\xE1\xA0\x83", // 'Mongolian full stop'. + "\xE1\xA0\xA9", // 'Mongolian manchu full stop'. + "\xE2\x80\xBC", // 'Double exclamation mark'. + "\xE2\x80\xBD", // 'Interrobang'. + "\xE2\x81\x87", // 'Double question mark'. + "\xE2\x81\x88", // 'Question exclamation mark'. + "\xE2\x81\x89", // 'Exclamation question mark'. + "\xE3\x80\x82", // 'Ideographic full stop'. + "\xEF\xB9\x92", // 'Small full stop'. + "\xEF\xB9\x97", // 'Small exclamation mark'. + "\xEF\xBC\x81", // 'Fullwidth exclamation mark'. + "\xEF\xBC\x8E", // 'Fullwidth full stop'. + "\xEF\xBC\x9E", // 'Fullwidth question mark'. + "\xEF\xBD\xA1", // 'Halfwidth ideographic full stop'. + ); + foreach ($stops as $stop) { + // First three sentences add up to: 336, so add one for space and then 3 to get half-way into next word. + $this->callTextSummary( + str_replace('.', $stop, $text), + str_replace('.', $stop, $expected), + NULL, + 340 + ); + } } /** @@ -286,104 +329,51 @@ class TextSummaryTestCase extends DrupalWebTestCase { // The summaries we expect text_summary() to return when $size is the index // of each array item. 
- // Using no text format: - $expected = array( - "\nHi\n
\n\nfolks\n
\n!\n
", - "
\n", - "
\nH", - "
\nHi", - "
\nHi\n", - "
\nHi\n<", - "
\nHi\n", - "
\nHi\n
\nHi\n", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
\nHi
", - "\nHi
", - "\nHi
", - "\nHi
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
", - "\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
\nHi\n
\n\nfolks\n
\n!\n
Hi
\nfolks
\n!
", + 1 => "", + 2 => "Hi
", + 3 => "Hi
", + 4 => "Hi
\n", + 5 => "Hi
\n", + 6 => "Hi
\n", + 7 => "Hi
\n", + 8 => "Hi
\nfolks
", + 9 => "Hi
\nfolks
", + 10 => "Hi
\nfolks
\n!
", ); // Test text_summary() for different sizes. - for ($i = 0; $i <= 37; $i++) { - $this->callTextSummary($text, $expected[$i], NULL, $i); - $this->callTextSummary($text, $expected_lb[$i], 'plain_text', $i); - $this->callTextSummary($text, $expected_lb[$i], 'filtered_html', $i); + for ($i = 0; $i <= 10; $i++) { + $this->callTextSummary($text, $expected[$i], 'filtered_html', $i); } } /** + * Test that we avoid breaking text in the middle of a CODE block. + */ + function testCode() { + $text = 'This is an example code block:' + . '$example = "Sentence one. Sentence two. Sentence three.
';
+ $expected = 'This is an example code block: