--- modules/node/node.module.orig 2010-11-20 00:34:05.880228594 -0800 +++ modules/node/node.module 2010-11-20 00:48:09.149501156 -0800 @@ -22,6 +22,12 @@ define('NODE_BUILD_RSS', 4); define('NODE_BUILD_PRINT', 5); +// see node_compare_teaser_and_body() +define('NODE_TEASER_OFF', 0); +define('NODE_TEASER_EQUAL', 1); +define('NODE_TEASER_START', 2); +define('NODE_TEASER_DIFF', 3); + /** * Implementation of hook_help(). */ @@ -292,6 +298,15 @@ * place such as the end of a paragraph, a line break, or the end of a * sentence (in that order of preference). * + * @note + * Note that the function uses strlen(), strpos(), etc. because it doesn't + * affect the content and those functions are faster than the mb_...() functions. + * Remember that when we cut a paragraph, we cut at a space or a tag so we won't + * inadvertently cut a UTF-8 character. (UTF-8 spaces are not considered as a + * place where we want to cut the paragraph.) However, the $size parameter is + * given in characters so the function uses drupal_strlen() to compare the size + * of the characters against $size. + * * @param $body * The content for which a teaser will be generated. * @param $format @@ -302,10 +317,12 @@ * The desired character length of the teaser. If omitted, the default * value will be used. Ignored if the special delimiter is present * in $body. + * @param $teaser_len + * (OUT) The length of the teaser before closing the HTML tags. * @return * The generated teaser. */ -function node_teaser($body, $format = NULL, $size = NULL) { +function node_teaser($body, $format = NULL, $size = NULL, &$teaser_len = NULL) { if (!isset($size)) { $size = variable_get('teaser_length', 600); @@ -316,11 +333,19 @@ // If the size is zero, and there is no delimiter, the entire body is the teaser. if ($size == 0 && $delimiter === FALSE) { + // caller requested length of teaser? 
+ if (isset($teaser_len)) { + $teaser_len = strlen($body); + } return $body; } // If a valid delimiter has been specified, use it to chop off the teaser. if ($delimiter !== FALSE) { + // caller requested length of teaser? + if (isset($teaser_len)) { + $teaser_len = $delimiter; + } return substr($body, 0, $delimiter); } @@ -330,72 +355,216 @@ if (isset($format)) { $filters = filter_list_format($format); if (isset($filters['php/0']) && strpos($body, '', $o); - // How much to cut off the end of the teaser so that it doesn't end in the - // middle of a paragraph, sentence, or word. - // Initialize it to maximum in order to find the minimum. - $min_rpos = $max_rpos; + if ($body[$o] == '/') { + // closing tag, pop the opening tag too + array_pop($s); + } + elseif ($body[$n - 1] != '/') { // skip empty tags + // opening tag, save its name on the stack so we can close it later + $end_name = strpos($body, ' ', $o); + if ($end_name === FALSE || $end_name > $n) { + $end_name = $n; + } + $tag_name = substr($body, $o, $end_name - $o); + switch ($tag_name) { // ignore empty tags that were not properly closed + case 'br': + case 'hr': + case 'img': + case 'input': + break; - // Store the reverse of the teaser. We use strpos on the reversed needle and - // haystack for speed and convenience. - $reversed = strrev($teaser); + default: + $s[] = $tag_name; + $last_tag = TRUE; + break; - // Build an array of arrays of break points grouped by preference. - $break_points = array(); + } + } - // A paragraph near the end of sliced teaser is most preferable. - $break_points[] = array('

' => 0); + // skip the tag now (we assume properly opening/closing tag boundaries!) + if ($n === FALSE) { + // last tag not closed or it wasn't a tag?! + $n = $len; + } + else { + ++$n; // skip the '>' character + } + } - // If no complete paragraph then treat line breaks as paragraphs. - $line_breaks = array('
' => 6, '
' => 4); - // Newline only indicates a line break if line break converter - // filter is present. - if (isset($filters['filter/1'])) { - $line_breaks["\n"] = 1; - } - $break_points[] = $line_breaks; + // any characters to add to the result? + if ($a) { + if ($l + $a >= $size) { + // the last tag did not make it in + if ($last_tag) { + array_pop($s); + } + // we've got more than we want to, search for a break point + $o = $p + $size - $l; + if ($body[$o] != ' ') while ($o > $p) { + switch ($body[$o - 1]) { + case "\xD8": // "\xD8\x9F" == arabic '?' (right to left) + if (!isset($body[$o]) || $body[$o] != "\x9F") { + // not the right sequence + break; + } + if ($o + 1 == $len || $body[$o + 1] == ' ') { + // found a break-point + break 2; + } + if ($body[$o + 1] == '"') { + $o += 2; + break 2; + } + break; - // If the first paragraph is too long, split at the end of a sentence. - $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1); + case '.': + case '!': + case '?': + if ($o == $len || $body[$o] == ' ') { + // found a break-point + break 2; + } + if ($body[$o] == '"') { + ++$o; + break 2; + } + break; - // Iterate over the groups of break points until a break point is found. - foreach ($break_points as $points) { - // Look for each break point, starting at the end of the teaser. - foreach ($points as $point => $offset) { - // The teaser is already reversed, but the break point isn't. - $rpos = strpos($reversed, strrev($point)); - if ($rpos !== FALSE) { - $min_rpos = min($rpos + $offset, $min_rpos); + case "\n": + if (!$filter_newline) { + break; + } + case ' ': + // found and remove the space (note that we ignore no-break spaces since we're not supposed to break there) + --$o; + break 2; + + //case ... add support for other UTF-8 spaces? + + case "\xE3": + // found the CJK ideographic full stop? 
+ if (isset($body[$o + 1]) && $body[$o] == "\x80" && $body[$o + 1] == "\x82") { + // keep this character in full + $o += 2; + break 2; + } + break; + + } + --$o; + } + $p = $o; + break; } + $l += $a; } - // If a break point was found in this group, slice and return the teaser. - if ($min_rpos !== $max_rpos) { - // Don't slice with length 0. Length must be <0 to slice from RHS. - return ($min_rpos === 0) ? $teaser : substr($teaser, 0, 0 - $min_rpos); - } + $p = $n; } - // If a break point was not found, still return a teaser. - return $teaser; + $result = substr($body, 0, $p); + if (!empty($s)) { + // if closing tags are missing we very likely cut a paragraph half way + // therefore we add the ... + $result .= t(' ...'); // ellipsis + do { + $result .= ''; + } while (!empty($s)); + } + + // caller requested length of teaser? + if (isset($teaser_len)) { + $teaser_len = $p; + } + + return $result; +} + +/** + * Compare the teaser and the body together and return one of the + * following values: + * + * @li NODE_TEASER_OFF -- the teaser is not defined + * @li NODE_TEASER_EQUAL -- the teaser and body are the same (avoid read-more) + * @li NODE_TEASER_START -- the teaser is equal to the beginning of the body + * @li NODE_TEASER_DIFF -- the teaser is different from the body + * + * @param $node + * The node of which the teaser and body are to be compared + * @return + * One of the NODE_TEASER_... values + */ +function node_compare_teaser_and_body($node) { + if (!isset($node->teaser)) { + return NODE_TEASER_OFF; + } + + // remove all the tags at the end because if they are closing tags + // then they were likely added by node_teaser(). 
+ $teaser = preg_replace('/(<[^>]+>)+$/', '', $node->teaser); + $len = strlen($teaser); + // node_teaser() may also add an ellipsis when it cuts a paragraph before the end + $ellipsis = t(' ...'); + $l = strlen($ellipsis); + if ($len > $l && substr($teaser, $len - $l) == $ellipsis) { + $len -= $l; + $teaser = rtrim(substr($teaser, 0, $len)); + $teaser = preg_replace('/(<[^>]+>)+$/', '', $teaser); + $len = strlen($teaser); + } + // now we can compare with the body + $included = $teaser == substr($node->body, 0, $len); + + if ($included) { + // remove closing tags, the teaser delimiter if present, and empty lines at the end + $remainder = substr($node->body, $len); + $remainder = preg_replace('/^(<[^>]+>[\n\r]*)+/', '', trim($remainder)); + $remainder = str_replace('', '', $remainder); + $remainder = preg_replace('%(

 

[\n\r]*)+$%', '', $remainder); + // if empty, then $teaser == $body! + return trim($remainder) ? NODE_TEASER_START : NODE_TEASER_EQUAL; + } + + return NODE_TEASER_DIFF; } /** @@ -829,12 +998,13 @@ // module-provided 'teaser' form item). if (!isset($node->teaser)) { if (isset($node->body)) { - $node->teaser = node_teaser($node->body, isset($node->format) ? $node->format : NULL); + $teaser_len = -1; + $node->teaser = node_teaser($node->body, isset($node->format) ? $node->format : NULL, NULL, $teaser_len); // Chop off the teaser from the body if needed. The teaser_include // property might not be set (eg. in Blog API postings), so only act on // it, if it was set with a given value. - if (isset($node->teaser_include) && !$node->teaser_include && $node->teaser == substr($node->body, 0, strlen($node->teaser))) { - $node->body = substr($node->body, strlen($node->teaser)); + if (isset($node->teaser_include) && !$node->teaser_include && node_compare_teaser_and_body($node) != NODE_TEASER_DIFF && $teaser_len > 0) { + $node->body = substr($node->body, $teaser_len); } } else { @@ -1036,7 +1206,7 @@ // First we'll overwrite the existing node teaser and body with // the filtered copies! Then, we'll stick those into the content // array and set the read more flag if appropriate. - $node->readmore = $node->teaser != $node->body; + $node->readmore = node_compare_teaser_and_body($node) != NODE_TEASER_EQUAL; if ($teaser == FALSE) { $node->body = check_markup($node->body, $node->format, FALSE);