Index: modules/filter/filter.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/filter/filter.module,v
retrieving revision 1.261
diff -u -p -r1.261 filter.module
--- modules/filter/filter.module	13 Jun 2009 19:37:27 -0000	1.261
+++ modules/filter/filter.module	28 Jun 2009 21:23:01 -0000
@@ -731,24 +731,98 @@ function _filter_url_settings($format) {
 }
 
 /**
- * URL filter. Automatically converts text web addresses (URLs, e-mail addresses,
- * ftp links, etc.) into hyperlinks.
+ * URL filter. Automatically converts text into hyperlinks.
+ *
+ * This filter identifies and makes clickable three types of "links":
+ * - URLs like http://example.com.
+ * - E-mail addresses like name@example.com.
+ * - Web addresses without the "http://" protocol defined, like www.example.com.
+ * Each type is processed in a separate pass, as there is no one regular
+ * expression that could possibly match all of the cases in one pass.
  */
 function _filter_url($text, $format) {
-  // Pass length to regexp callback
-  _filter_url_trim(NULL, variable_get('filter_url_length_' . $format, 72));
+  // List of tags - the content of which must be skipped.
+  $ignore_tags = 'a|script|style|code';
 
-  $text = ' ' . $text . ' ';
+  // Create an array which contains the regexps for each type of link.
+  // The key to the regexp is the name of a function that is used as
+  // callback function to process matches of the regexp. The callback function
+  // is to return the replacement for the match. The array is used and
+  // matching/replacement done below inside some loops.
+  $tasks = array();
 
   // Match absolute URLs.
-  $text = preg_replace_callback("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '_filter_url_parse_full_links', $text);
+  $protocols = 'http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://';
+  $url_pattern = "(?:$protocols)(?:[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-])";
+  $pattern = "`($url_pattern)([\.\,\?\!]*?)`i";
+  $tasks['_filter_url_parse_full_links'] = $pattern;
 
   // Match e-mail addresses.
-  $text = preg_replace("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '\1<a href="mailto:\2">\2</a>\3', $text);
+  // Note: ICANN seems to be on track towards accepting more diverse top-level
+  // domains, so this pattern is "future-proofed" to allow TLDs of length 2-64.
+  $url_pattern = '[A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,64}';
+  $pattern = "`($url_pattern)`i";
+  $tasks['_filter_url_parse_email_links'] = $pattern;
 
   // Match www domains/addresses.
-  $text = preg_replace_callback("`(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '_filter_url_parse_partial_links', $text);
-  $text = substr($text, 1, -1);
+  $url_pattern = 'www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-]';
+  $pattern = "`($url_pattern)([\.\,\?\!]*?)`i";
+  $tasks['_filter_url_parse_partial_links'] = $pattern;
+
+  // Pass length to regexp callback.
+  _filter_url_trim(NULL, variable_get('filter_url_length_' . $format, 72));
+
+  // We need to process each case of replacement type separately.
+  // The text must be joined and split again after each
+  // replacement, since replacements create new HTML tags and the new
+  // tags must be correctly protected before the next replacement can be done.
+  foreach ($tasks as $task => $pattern) {
+    // Split at all tags.
+    // This ensures that nothing that is a tagname or attribute will be processed.
+    // The "s" modifier lets "." match newlines, so tags spanning lines are found.
+    $chunks = preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
+    // Note: PHP ensures the array consists of alternating delimiters and literals
+    // and begins and ends with a literal (inserting empty strings as required).
+    // Therefore, first chunk is always text:
+    $chunk_type = 'text';
+    // Tags to ignore are defined in $ignore_tags (see above).
+    // If an ignore_tag is found, it is stored here and removed only when the
+    // closing tag is found. Until the closing tag is found, no replacements are made.
+    $open_tag = '';
+
+    for ($i = 0, $count = count($chunks); $i < $count; $i++) {
+      if ($chunk_type == 'text') {
+        // Only do replacements when there are no unclosed ignore_tags.
+        if ($open_tag == '') {
+          // This is the high point of this function! If there is a match,
+          // a link is created in the callback function named by $task.
+          $chunks[$i] = preg_replace_callback($pattern, $task, $chunks[$i]);
+        }
+        // Done processing text chunk, so next chunk is a tag.
+        $chunk_type = 'tag';
+      }
+      else {
+        if ($open_tag == '') {
+          // No open ignore_tags. Process this tag...
+          if (preg_match("`<($ignore_tags)(?:\s|>)`i", $chunks[$i], $matches)) {
+            // This matches one of the $ignore_tags.
+            // Catch and store the tag in question.
+            $open_tag = $matches[1];
+          }
+        }
+        else {
+          // There is an $ignore_tag open. See if this is a matching closing tag.
+          // Nothing else is done until we find the closing tag.
+          if (preg_match("`<\/$open_tag>`i", $chunks[$i], $matches)) {
+            $open_tag = '';
+          }
+        }
+        // Done processing tag chunk, so next chunk is text.
+        $chunk_type = 'text';
+      }
+    }
+    $text = implode($chunks);
+  }
 
   return $text;
 }
@@ -828,23 +902,42 @@ function _filter_htmlcorrector($text) {
 }
 
 /**
- * Make links out of absolute URLs.
+ * Callback for preg_replace_callback(). Make links out of absolute URLs.
  */
 function _filter_url_parse_full_links($match) {
-  $match[2] = decode_entities($match[2]);
-  $caption = check_plain(_filter_url_trim($match[2]));
-  $match[2] = check_url($match[2]);
-  return $match[1] . '<a href="' . $match[2] . '" title="' . $match[2] . '">' . $caption . '</a>' . $match[5];
+  // Group $i of the regexp is the URL; group $i + 1 is trailing punctuation.
+  $i = 1;
+
+  $match[$i] = decode_entities($match[$i]);
+  $caption = check_plain(_filter_url_trim($match[$i]));
+  $match[$i] = check_url($match[$i]);
+  return '<a href="' . $match[$i] . '" title="' . $match[$i] . '">' . $caption . '</a>' . $match[$i+1];
 }
 
 /**
- * Make links out of domain names starting with "www."
+ * Callback for preg_replace_callback(). Make links out of e-mail addresses.
+ */
+function _filter_url_parse_email_links($match) {
+  // Index 0 is the entire regexp match, which is the e-mail address.
+  $i = 0;
+
+  $match[$i] = decode_entities($match[$i]);
+  $caption = check_plain(_filter_url_trim($match[$i]));
+  $match[$i] = check_url($match[$i]);
+  return '<a href="mailto:' . $match[$i] . '" title="' . $match[$i] . '">' . $caption . '</a>';
+}
+
+/**
+ * Callback for preg_replace_callback(). Make links out of "www." domain names.
  */
 function _filter_url_parse_partial_links($match) {
-  $match[2] = decode_entities($match[2]);
-  $caption = check_plain(_filter_url_trim($match[2]));
-  $match[2] = check_plain($match[2]);
-  return $match[1] . '<a href="http://' . $match[2] . '" title="' . $match[2] . '">' . $caption . '</a>' . $match[3];
+  // Group $i of the regexp is the address; group $i + 1 is trailing punctuation.
+  $i = 1;
+
+  $match[$i] = decode_entities($match[$i]);
+  $caption = check_plain(_filter_url_trim($match[$i]));
+  $match[$i] = check_plain($match[$i]);
+  return '<a href="http://' . $match[$i] . '" title="' . $match[$i] . '">' . $caption . '</a>' . $match[$i+1];
 }
 
 /**
Index: modules/filter/filter.test
===================================================================
RCS file: /cvs/drupal/drupal/modules/filter/filter.test,v
retrieving revision 1.25
diff -u -p -r1.25 filter.test
--- modules/filter/filter.test	28 Jun 2009 18:03:56 -0000	1.25
+++ modules/filter/filter.test	28 Jun 2009 21:23:02 -0000
@@ -582,6 +582,13 @@ class FilterTestCase extends DrupalWebTe
 
     $f = _filter_url('www.example.com/index.php?a=.', 'f');
     $this->assertEqual($f, '<a href="http://www.example.com/index.php?a=" title="www.example.com/index.php?a=">www.example.com/index.php?a=</a>.', t('Converting URLs -- do forget about a dot at the end of a query string.'));
+
+    // Confirm that URLs in comments are not processed.
+    $filtered = _filter_url('<!-- http://www.example.com -->', 'f');
+    $this->assertEqual($filtered, '<!-- http://www.example.com -->', t('Converting URLs -- do not change comments.'));
+
+    $filtered = _filter_url('<blockquote>http://www.example.com</blockquote>', 'f'); // Not so sure if it should or it shouldn't convert these.
+    $this->assertEqual($filtered, '<blockquote><a href="http://www.example.com" title="http://www.example.com">http://www.example.com</a></blockquote>', t('Converting URLs -- blockquote handling.'));
   }
 
   /**
