From 5d83f55f07f7923d06666a22d47a174d8d4e53d7 Mon Sep 17 00:00:00 2001
From: Bob Vincent
tag, but if the original
+  // formatted version was plaintext, then the summary should be plaintext
+  // also.
+  if ($body === strip_tags($body)) {
+    $output = strip_tags($output);
+  }
+  return trim($output). "\r\n\r\n";
+}
-  // Iterate over the groups of break points until a break point is found.
-  foreach ($break_points as $points) {
-    // Look for each break point, starting at the end of the teaser.
-    foreach ($points as $point => $offset) {
-      // The teaser is already reversed, but the break point isn't.
-      $rpos = strpos($reversed, strrev($point));
-      if ($rpos !== FALSE) {
-        $min_rpos = min($rpos + $offset, $min_rpos);
+/**
+ * Helper function for text_summary.
+ *
+ * Recursively copies $body into $summary (tag names only; attributes and
+ * childless elements are not copied), deducting text lengths from $size.
+ *
+ * @param $body
+ *   The source DOMNode.
+ * @param $size
+ *   The maximum number of textContent characters to copy.
+ * @param $summary
+ *   The destination DOMNode.
+ * @param $doc
+ *   The destination DOMDocument. Should be the same as the
+ *   $summary->ownerDocument property.
+ * @param $parents
+ *   An array of tag names of ancestor nodes.
+ *
+ * @return
+ *   The characters still available; 0 once a text node did not fit whole.
+ */
+function _text_summarize($body, $size, $summary, $doc, $parents = array()) {
+  static $sentence_splitter;
+  static $word_splitter;
+  if (!isset($sentence_splitter)) {
+    // According to http://unicode.org/review/pr-23.html, these are the Unicode
+    // Sentence_Terminal characters, written as UTF-8 byte sequences.
+    $stops =
+      "\x21" . // U+0021 'Exclamation mark'.
+      "\x2E" . // U+002E 'Full stop'.
+      "\x3F" . // U+003F 'Question mark'.
+      "\xD6\x89" . // U+0589 'Armenian full stop'.
+      "\xD8\x9F" . // U+061F 'Arabic question mark'.
+      "\xDB\x94" . // U+06D4 'Arabic full stop'.
+      "\xDC\x80" . // U+0700 'Syriac end of paragraph'.
+      "\xDC\x81" . // U+0701 'Syriac supralinear full stop'.
+      "\xDC\x82" . // U+0702 'Syriac sublinear full stop'.
+      "\xE0\xA5\xA4" . // U+0964 'Devanagari danda'.
+      "\xE1\x81\x8A" . // U+104A 'Myanmar sign little section'.
+      "\xE1\x81\x8B" . // U+104B 'Myanmar sign section'.
+      "\xE1\x8D\xA2" . // U+1362 'Ethiopic full stop'.
+      "\xE1\x8D\xA7" . // U+1367 'Ethiopic question mark'.
+      "\xE1\x8D\xA8" . // U+1368 'Ethiopic paragraph separator'.
+      "\xE1\x99\xAE" . // U+166E 'Canadian syllabics full stop'.
+      "\xE1\xA0\x83" . // U+1803 'Mongolian full stop'.
+      "\xE1\xA0\x89" . // U+1809 'Mongolian manchu full stop'.
+      "\xE2\x80\xBC" . // U+203C 'Double exclamation mark'.
+      "\xE2\x80\xBD" . // U+203D 'Interrobang'.
+      "\xE2\x81\x87" . // U+2047 'Double question mark'.
+      "\xE2\x81\x88" . // U+2048 'Question exclamation mark'.
+      "\xE2\x81\x89" . // U+2049 'Exclamation question mark'.
+      "\xE3\x80\x82" . // U+3002 'Ideographic full stop'.
+      "\xEF\xB9\x92" . // U+FE52 'Small full stop'.
+      "\xEF\xB9\x97" . // U+FE57 'Small exclamation mark'.
+      "\xEF\xBC\x81" . // U+FF01 'Fullwidth exclamation mark'.
+      "\xEF\xBC\x8E" . // U+FF0E 'Fullwidth full stop'.
+      "\xEF\xBC\x9F" . // U+FF1F 'Fullwidth question mark'.
+      "\xEF\xBD\xA1"; // U+FF61 'Halfwidth ideographic full stop'.
+    // We split after Sentence_Terminal characters only if preceded by a Letter
+    // character and followed by a Separator character.
+    $sentence_splitter = '/(?<=\p{L}[' . $stops . '])(?=\p{Z})/u';
+    // If no suitable sentence break is found, we split before any Unicode
+    // Separator character.
+    $word_splitter = '/(?=\p{Z})/u';
+  }
+  if ($body->nodeType === XML_TEXT_NODE) {
+    $text_length = drupal_strlen($body->textContent);
+    if ($text_length <= $size) {
+      $size -= $text_length;
+      $summary->appendChild($doc->createTextNode($body->textContent));
+      return $size;
+    }
+    // We avoid breaking text nodes within code blocks.
+    if (in_array('code', $parents)) {
+      // Return zero to avoid adding subsequent text nodes.
+      return 0;
+    }
+    $sentences = preg_split($sentence_splitter, $body->textContent);
+    $text = '';
+    foreach ($sentences as $sentence) {
+      $sentence_length = drupal_strlen($sentence);
+      // Stop at the first sentence that no longer fits in the remaining size.
+      if ($sentence_length > $size) {
+        break;
       }
+      $text .= $sentence;
+      $size -= $sentence_length;
     }
-
-    // If a break point was found in this group, slice and return the teaser.
-    if ($min_rpos !== $max_rpos) {
-      // Don't slice with length 0. Length must be <0 to slice from RHS.
-      return ($min_rpos === 0) ? $teaser : substr($teaser, 0, 0 - $min_rpos);
+    // If not even the first sentence fit, fall back to breaking between words.
+    if ($text === '') {
+      $words = preg_split($word_splitter, $body->textContent);
+      foreach ($words as $word) {
+        $word_length = drupal_strlen($word);
+        // Stop at the first word that no longer fits in the remaining size.
+        if ($word_length > $size) {
+          break;
+        }
+        $text .= $word;
+        $size -= $word_length;
+      }
     }
+    $summary->appendChild($doc->createTextNode($text));
+    // Return zero to avoid adding subsequent text nodes.
+    return 0;
   }
-
-  // If a break point was not found, still return a teaser.
-  return $teaser;
+  if ($body->hasChildNodes()) {
+    $node = $summary->appendChild($doc->createElement($body->tagName));
+    $parents[] = $body->tagName;
+    foreach ($body->childNodes as $child) {
+      if ($size > 0) {
+        $size = _text_summarize($child, $size, $node, $doc, $parents);
+      }
+      else {
+        break;
+      }
+    }
+  }
+  return $size;
 }
 
 /**
@@ -1269,10 +1361,10 @@ function node_search($op = 'search', $keys = NULL) {
       $join2 .= ' LEFT JOIN {node_counter} nc ON nc.nid = i.sid';
       $total += $weight;
     }
-
-  // When all search factors are disabled (ie they have a weight of zero),
-  // the default score is based only on keyword relevance and there is no need to
-  // adjust the score of each item.
+
+  // When all search factors are disabled (ie they have a weight of zero),
+  // the default score is based only on keyword relevance and there is no need to
+  // adjust the score of each item.
   if ($total == 0) {
     $select2 = 'i.relevance AS score';
     $total = 1;
@@ -1280,7 +1372,7 @@ function node_search($op = 'search', $keys = NULL) {
   else {
     $select2 = implode(' + ', $ranking) . 
' AS score'; } - + // Do search. $find = do_search($keys, 'node', 'INNER JOIN {node} n ON n.nid = i.sid '. $join1, $conditions1 . (empty($where1) ? '' : ' AND '. $where1), $arguments1, $select2, $join2, $arguments2); @@ -1708,7 +1800,7 @@ function node_feed($nids = FALSE, $channel = array()) { $item->body = $content; unset($item->teaser); } - + // Allow modules to modify the fully-built node. node_invoke_nodeapi($item, 'alter', $teaser, FALSE); } -- 1.7.5.4