--- modules/node/node.module.orig	2010-11-20 00:34:05.880228594 -0800
+++ modules/node/node.module	2010-11-20 00:48:09.149501156 -0800
@@ -22,6 +22,12 @@
 define('NODE_BUILD_RSS', 4);
 define('NODE_BUILD_PRINT', 5);
 
+// Return values of node_compare_teaser_and_body().
+define('NODE_TEASER_OFF', 0);
+define('NODE_TEASER_EQUAL', 1);
+define('NODE_TEASER_START', 2);
+define('NODE_TEASER_DIFF', 3);
+
 /**
  * Implementation of hook_help().
  */
@@ -292,6 +298,15 @@
  * place such as the end of a paragraph, a line break, or the end of a
  * sentence (in that order of preference).
  *
+ * @note
+ *   Note that the function uses strlen(), strpos(), etc. because doing so does
+ *   not alter the content and those functions are faster than the mb_...() ones.
+ *   Remember that when we cut a paragraph, we cut at a space or a tag, so we won't
+ *   inadvertently cut a UTF-8 character. (UTF-8 spaces are not considered as a
+ *   place where we want to cut the paragraph.) However, the $size parameter is
+ *   given in characters, so the function uses drupal_strlen() to compare the
+ *   character count of the text against $size.
+ *
  * @param $body
  *   The content for which a teaser will be generated.
  * @param $format
@@ -302,10 +317,12 @@
  *   The desired character length of the teaser. If omitted, the default
  *   value will be used. Ignored if the special delimiter is present
  *   in $body.
+ * @param $teaser_len
+ *   (OUT) Byte length of the teaser before HTML tags are closed; only written if non-NULL on entry.
  * @return
  *   The generated teaser.
  */
-function node_teaser($body, $format = NULL, $size = NULL) {
+function node_teaser($body, $format = NULL, $size = NULL, &$teaser_len = NULL) {
 
   if (!isset($size)) {
     $size = variable_get('teaser_length', 600);
@@ -316,11 +333,19 @@
 
   // If the size is zero, and there is no delimiter, the entire body is the teaser.
   if ($size == 0 && $delimiter === FALSE) {
+    // caller requested length of teaser?
+    if (isset($teaser_len)) {
+      $teaser_len = strlen($body);
+    }
     return $body;
   }
 
   // If a valid delimiter has been specified, use it to chop off the teaser.
   if ($delimiter !== FALSE) {
+    // caller requested length of teaser?
+    if (isset($teaser_len)) {
+      $teaser_len = $delimiter;
+    }
     return substr($body, 0, $delimiter);
   }
 
@@ -330,72 +355,216 @@
   if (isset($format)) {
     $filters = filter_list_format($format);
     if (isset($filters['php/0']) && strpos($body, '<?') !== FALSE) {
+      // caller requested length of teaser?
+      if (isset($teaser_len)) {
+        $teaser_len = strlen($body);
+      }
       return $body;
     }
   }
 
   // If we have a short body, the entire body is the teaser.
   if (drupal_strlen($body) <= $size) {
+    // caller requested length of teaser?
+    if (isset($teaser_len)) {
+      $teaser_len = strlen($body);
+    }
     return $body;
   }
 
-  // If the delimiter has not been specified, try to split at paragraph or
-  // sentence boundaries.
+  $filter_newline = isset($filters['filter/1']);
+  $len = strlen($body);
 
-  // The teaser may not be longer than maximum length specified. Initial slice.
-  $teaser = truncate_utf8($body, $size);
+  $p = 0;
+  $l = 0;
+  $s = array(); // stack
+  while ($p < $len && $l < $size) {
+    $last_tag = FALSE;
+    $o = strpos($body, '<', $p);
+    if ($o === FALSE) {
+      // no more tags till the end
+      $a = drupal_strlen(substr($body, $p, $len - $p)); // UTF-8 length
+      $n = $len;
+    }
+    else {
+      // count characters between previous position and
+      // beginning of tag
+      $a = drupal_strlen(substr($body, $p, $o - $p)); // UTF-8 length
 
-  // Store the actual length of the UTF8 string -- which might not be the same
-  // as $size.
-  $max_rpos = strlen($teaser);
+      ++$o; // skip the '<'
+      $n = strpos($body, '>', $o);
 
-  // How much to cut off the end of the teaser so that it doesn't end in the
-  // middle of a paragraph, sentence, or word.
-  // Initialize it to maximum in order to find the minimum.
-  $min_rpos = $max_rpos;
+      if ($body[$o] == '/') {
+        // closing tag, pop the opening tag too
+        array_pop($s);
+      }
+      elseif ($body[$n - 1] != '/') { // skip empty tags
+        // opening tag, save its name on the stack so we can close it later
+        $end_name = strpos($body, ' ', $o);
+        if ($end_name === FALSE || $end_name > $n) {
+          $end_name = $n;
+        }
+        $tag_name = substr($body, $o, $end_name - $o);
+        switch ($tag_name) { // ignore empty tags that were not properly closed
+        case 'br':
+        case 'hr':
+        case 'img':
+        case 'input':
+          break;
 
-  // Store the reverse of the teaser.  We use strpos on the reversed needle and
-  // haystack for speed and convenience.
-  $reversed = strrev($teaser);
+        default:
+          $s[] = $tag_name;
+          $last_tag = TRUE;
+          break;
 
-  // Build an array of arrays of break points grouped by preference.
-  $break_points = array();
+        }
+      }
 
-  // A paragraph near the end of sliced teaser is most preferable.
-  $break_points[] = array('</p>' => 0);
+      // skip the tag now (we assume properly opening/closing tag boundaries!)
+      if ($n === FALSE) {
+        // last tag not closed or it wasn't a tag?!
+        $n = $len;
+      }
+      else {
+        ++$n;  // skip the '>' character
+      }
+    }
 
-  // If no complete paragraph then treat line breaks as paragraphs.
-  $line_breaks = array('<br />' => 6, '<br>' => 4);
-  // Newline only indicates a line break if line break converter
-  // filter is present.
-  if (isset($filters['filter/1'])) {
-    $line_breaks["\n"] = 1;
-  }
-  $break_points[] = $line_breaks;
+    // any characters to add to the result?
+    if ($a) {
+      if ($l + $a >= $size) {
+        // the last tag did not make it in
+        if ($last_tag) {
+          array_pop($s);
+        }
+        // we've got more than we want to, search for a break point
+        $o = $p + $size - $l;
+        if ($body[$o] != ' ') while ($o > $p) {
+          switch ($body[$o - 1]) {
+          case "\xD8": // "\xD8\x9F" == arabic '?' (right to left)
+            if (!isset($body[$o]) || $body[$o] != "\x9F") {
+              // not the right sequence
+              break;
+            }
+            if ($o + 1 == $len || $body[$o + 1] == ' ') {
+              // found a break-point
+              break 2;
+            }
+            if ($body[$o + 1] == '"') {
+              $o += 2;
+              break 2;
+            }
+            break;
 
-  // If the first paragraph is too long, split at the end of a sentence.
-  $break_points[] = array('. ' => 1, '! ' => 1, '? ' => 1, '。' => 0, '؟ ' => 1);
+          case '.':
+          case '!':
+          case '?':
+            if ($o == $len || $body[$o] == ' ') {
+              // found a break-point
+              break 2;
+            }
+            if ($body[$o] == '"') {
+              ++$o;
+              break 2;
+            }
+            break;
 
-  // Iterate over the groups of break points until a break point is found.
-  foreach ($break_points as $points) {
-    // Look for each break point, starting at the end of the teaser.
-    foreach ($points as $point => $offset) {
-      // The teaser is already reversed, but the break point isn't.
-      $rpos = strpos($reversed, strrev($point));
-      if ($rpos !== FALSE) {
-        $min_rpos = min($rpos + $offset, $min_rpos);
+          case "\n":
+            if (!$filter_newline) {
+              break;
+            }
+          case ' ':
+            // found a space; remove it (note that we ignore no-break spaces since we're not supposed to break there)
+            --$o;
+            break 2;
+
+          //case ... add support for other UTF-8 spaces?
+
+          case "\xE3":
+            // found the CJK ideographic full stop?
+            if (isset($body[$o + 1]) && $body[$o] == "\x80" && $body[$o + 1] == "\x82") {
+              // keep this character in full
+              $o += 2;
+              break 2;
+            }
+            break;
+
+          }
+          --$o;
+        }
+        $p = $o;
+        break;
       }
+      $l += $a;
     }
 
-    // If a break point was found in this group, slice and return the teaser.
-    if ($min_rpos !== $max_rpos) {
-      // Don't slice with length 0.  Length must be <0 to slice from RHS.
-      return ($min_rpos === 0) ? $teaser : substr($teaser, 0, 0 - $min_rpos);
-    }
+    $p = $n;
   }
 
-  // If a break point was not found, still return a teaser.
-  return $teaser;
+  $result = substr($body, 0, $p);
+  if (!empty($s)) {
+    // if closing tags are missing we very likely cut a paragraph half way
+    // therefore we add the ...
+    $result .= t(' ...');  // ellipsis
+    do {
+      $result .= '</' . array_pop($s) . '>';
+    } while (!empty($s));
+  }
+
+  // caller requested length of teaser?
+  if (isset($teaser_len)) {
+    $teaser_len = $p;
+  }
+
+  return $result;
+}
+
+/**
+ * Classify how a node's teaser relates to its body and return one of the
+ * following values:
+ *
+ * @li NODE_TEASER_OFF -- $node->teaser is not set
+ * @li NODE_TEASER_EQUAL -- the teaser covers the whole body (avoid read-more)
+ * @li NODE_TEASER_START -- the teaser is equal to the beginning of the body
+ * @li NODE_TEASER_DIFF -- the teaser is not a prefix of the body
+ *
+ * @param $node
+ *    The node whose teaser and body are compared; reads ->teaser and ->body.
+ * @return
+ *    One of the NODE_TEASER_... values
+ */
+function node_compare_teaser_and_body($node) {
+  if (!isset($node->teaser)) {
+    return NODE_TEASER_OFF;
+  }
+
+  // strip every tag at the very end of the teaser: if they are closing tags
+  // then they were likely appended by node_teaser().
+  $teaser = preg_replace('/(<[^>]+>)+$/', '', $node->teaser);
+  $len = strlen($teaser);
+  // node_teaser() may also append an ellipsis when it cuts a paragraph short
+  $ellipsis = t(' ...');
+  $l = strlen($ellipsis);
+  if ($len > $l && substr($teaser, $len - $l) == $ellipsis) {
+    $len -= $l;
+    $teaser = rtrim(substr($teaser, 0, $len));
+    $teaser = preg_replace('/(<[^>]+>)+$/', '', $teaser);
+    $len = strlen($teaser);
+  }
+  // is the cleaned teaser a prefix of the body? (loose == comparison)
+  $included = $teaser == substr($node->body, 0, $len);
+
+  if ($included) {
+    // drop leading tags, any <!--break--> marker, and trailing <p>&nbsp;</p> lines
+    $remainder = substr($node->body, $len);
+    $remainder = preg_replace('/^(<[^>]+>[\n\r]*)+/', '', trim($remainder));
+    $remainder = str_replace('<!--break-->', '', $remainder);
+    $remainder = preg_replace('%(<p>&nbsp;</p>[\n\r]*)+$%', '', $remainder);
+    // nothing left (or a falsy string such as "0") means teaser == body
+    return trim($remainder) ? NODE_TEASER_START : NODE_TEASER_EQUAL;
+  }
+
+  return NODE_TEASER_DIFF;
 }
 
 /**
@@ -829,12 +998,13 @@
   // module-provided 'teaser' form item).
   if (!isset($node->teaser)) {
     if (isset($node->body)) {
-      $node->teaser = node_teaser($node->body, isset($node->format) ? $node->format : NULL);
+      $teaser_len = -1;
+      $node->teaser = node_teaser($node->body, isset($node->format) ? $node->format : NULL, NULL, $teaser_len);
       // Chop off the teaser from the body if needed. The teaser_include
       // property might not be set (eg. in Blog API postings), so only act on
       // it, if it was set with a given value.
-      if (isset($node->teaser_include) && !$node->teaser_include && $node->teaser == substr($node->body, 0, strlen($node->teaser))) {
-        $node->body = substr($node->body, strlen($node->teaser));
+      if (isset($node->teaser_include) && !$node->teaser_include && node_compare_teaser_and_body($node) != NODE_TEASER_DIFF && $teaser_len > 0) {
+        $node->body = substr($node->body, $teaser_len);
       }
     }
     else {
@@ -1036,7 +1206,7 @@
   // First we'll overwrite the existing node teaser and body with
   // the filtered copies! Then, we'll stick those into the content
   // array and set the read more flag if appropriate.
-  $node->readmore = $node->teaser != $node->body;
+  $node->readmore = node_compare_teaser_and_body($node) != NODE_TEASER_EQUAL;
 
   if ($teaser == FALSE) {
     $node->body = check_markup($node->body, $node->format, FALSE);