Index: modules/filter/filter.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/filter/filter.module,v
retrieving revision 1.219
diff -u -p -r1.219 filter.module
--- modules/filter/filter.module	14 Aug 2008 11:58:06 -0000	1.219
+++ modules/filter/filter.module	15 Aug 2008 08:59:55 -0000
@@ -306,8 +306,8 @@ function filter_formats($index = NULL) {
       $args[] = variable_get('filter_default_format', 1);
     }
 
-    $result = db_query($query . ' ORDER by weight', $args);
-    while ($format = db_fetch_object($result)) {
+    $result = db_query($query . ' ORDER by weight', $args);
+    while ($format = db_fetch_object($result)) {
       $formats[$format->format] = $format;
     }
   }
@@ -370,8 +370,8 @@ function filter_list_format($format) {
 
   if (!isset($filters[$format])) {
     $filters[$format] = array();
-    $result = db_query("SELECT * FROM {filters} WHERE format = %d ORDER BY weight, module, delta", $format);
-    while ($filter = db_fetch_object($result)) {
+    $result = db_query("SELECT * FROM {filters} WHERE format = %d ORDER BY weight, module, delta", $format);
+    while ($filter = db_fetch_object($result)) {
       $list = module_invoke($filter->module, 'filter', 'list');
       if (isset($list) && is_array($list) && isset($list[$filter->delta])) {
         $filter->name = $list[$filter->delta];
@@ -714,24 +714,99 @@ function _filter_url_settings($format) {
 }
 
 /**
- * URL filter. Automatically converts text web addresses (URLs, e-mail addresses,
- * ftp links, etc.) into hyperlinks.
+ * URL filter. Automatically converts text web addresses (URLs, e-mail
+ * addresses, ftp links, etc.) into hyperlinks.
+ *
+ * This filter identifies and makes clickable three types of "links".
+ * - URLs like http://example.com.
+ * - E-mail addresses like name@example.com.
+ * - Web addresses without the "http://" protocol defined, like www.example.com.
+ * Each type must be processed separately, as there is no one regular
+ * expression that could possibly match all of the cases in one pass.
  */
 function _filter_url($text, $format) {
-  // Pass length to regexp callback
-  _filter_url_trim(NULL, variable_get('filter_url_length_' . $format, 72));
+  // Tags whose content must never be turned into links.
+  $ignoretags = 'a|script|style|code';
 
-  $text = ' ' . $text . ' ';
+  // Map of callback name => regular expression, one entry per link type.
+  // Each callback receives the preg match array and returns the replacement
+  // markup for that match. Entries are applied in insertion order by the loop
+  // further down, so absolute URLs are linked before e-mail addresses and
+  // bare "www." addresses.
+  $tasks = array();
 
   // Match absolute URLs.
-  $text = preg_replace_callback("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '_filter_url_parse_full_links', $text);
+  $protocols = 'http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://';
+  $url_pattern = "(?:$protocols)(?:[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-])";
+  $pattern = "`($url_pattern)([\.\,\?\!]*?)`i";
+  $tasks['_filter_url_parse_full_links'] = $pattern;
 
   // Match e-mail addresses.
-  $text = preg_replace("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '\1<a href="mailto:\2">\2</a>\3', $text);
+  // Note: ICANN seems to be moving towards accepting more diverse top-level
+  // domains, so the pattern is "future-proofed" to allow TLDs of length 2-64.
+  $url_pattern = '[A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,64}';
+  $pattern = "`($url_pattern)`i";
+  $tasks['_filter_url_parse_email_links'] = $pattern;
 
   // Match www domains/addresses.
-  $text = preg_replace_callback("`(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '_filter_url_parse_partial_links', $text);
-  $text = substr($text, 1, -1);
+  $url_pattern = 'www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-]';
+  $pattern = "`($url_pattern)([\.\,\?\!]*?)`i";
+  $tasks['_filter_url_parse_partial_links'] = $pattern;
+
+  // Pass the maximum length to the regexp callbacks; they later call
+  // _filter_url_trim() with only the text to trim.
+  _filter_url_trim(NULL, variable_get('filter_url_length_' . $format, 72));
+
+  // Each link type is processed in its own pass. The text is split and joined
+  // again for every pass, because a pass inserts new <a> tags and those must
+  // be recognized as tags (and their content skipped) before the next pass
+  // runs.
+  foreach ($tasks as $task => $pattern) {
+    // Split at all tags, so that tag names and attributes are never processed.
+    // The "s" modifier lets a tag or comment that spans several lines match.
+    $chunks = preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
+    // With PREG_SPLIT_DELIM_CAPTURE the result alternates between text and
+    // tag chunks and starts and ends with a text chunk (an empty string where
+    // the input begins or ends with a tag), so the first chunk is always text.
+    $chunk_type = 'text';
+    // Name of the currently open tag from $ignoretags, if any. While it is
+    // set, no replacements are made; it is cleared again when the matching
+    // closing tag is found.
+    $opentag = '';
+
+    for ($i = 0, $count = count($chunks); $i < $count; $i++) {
+      if ($chunk_type == 'text') {
+        // Only do replacements when there are no unclosed ignoretags.
+        if ($opentag == '') {
+          // If there is a match, the link is created by the callback
+          // function named by $task.
+          $chunks[$i] = preg_replace_callback($pattern, $task, $chunks[$i]);
+        }
+        // Done processing text chunk, so next chunk is a tag.
+        $chunk_type = 'tag';
+      }
+      else {
+        if ($opentag == '') {
+          // No open ignoretags. Process this tag...
+          if (preg_match("`<($ignoretags)(?:\s|>)`i", $chunks[$i], $matches)) {
+            // This opens one of the $ignoretags.
+            // Remember which one, so its closing tag can be recognized.
+            $opentag = $matches[1];
+          }
+        }
+        else {
+          // There is an $ignoretag open. See if this is a matching closing tag.
+          // Nothing else is done until we find the closing tag.
+          if (preg_match("`<\/$opentag>`i", $chunks[$i], $matches)) {
+            $opentag = '';
+          }
+        }
+        // Done processing tag chunk, so next chunk is text.
+        $chunk_type = 'text';
+      }
+    }
+    $text = implode($chunks);
+  }
 
   return $text;
 }
@@ -811,23 +886,42 @@ function _filter_htmlcorrector($text) {
 }
 
 /**
- * Make links out of absolute URLs.
+ * Callback function. Make links out of absolute URLs.
  */
 function _filter_url_parse_full_links($match) {
-  $match[2] = decode_entities($match[2]);
-  $caption = check_plain(_filter_url_trim($match[2]));
-  $match[2] = check_url($match[2]);
-  return $match[1] . '<a href="' . $match[2] . '" title="' . $match[2] . '">' . $caption . '</a>' . $match[5];
+  // Capture group 1 is the URL; group 2 is whatever the pattern's trailing
+  // punctuation group captured, and is appended after the link unchanged.
+  $url = decode_entities($match[1]);
+  // The caption is built from the decoded URL, before check_url() is applied.
+  $caption = check_plain(_filter_url_trim($url));
+  $href = check_url($url);
+  return '<a href="' . $href . '" title="' . $href . '">' . $caption . '</a>' . $match[2];
 }
 
 /**
- * Make links out of domain names starting with "www."
+ * Callback function. Make links out of e-mail addresses.
+ */
+function _filter_url_parse_email_links($match) {
+  // Index 0 is the complete match, which for the e-mail pattern is the
+  // address itself.
+  $address = decode_entities($match[0]);
+  // The caption is built from the decoded address, before check_url() runs.
+  $caption = check_plain(_filter_url_trim($address));
+  $address = check_url($address);
+  return '<a href="mailto:' . $address . '" title="' . $address . '">' . $caption . '</a>';
+}
+
+/**
+ * Callback function. Make links out of domain names starting with "www.".
  */
 function _filter_url_parse_partial_links($match) {
-  $match[2] = decode_entities($match[2]);
-  $caption = check_plain(_filter_url_trim($match[2]));
-  $match[2] = check_plain($match[2]);
-  return $match[1] . '<a href="http://' . $match[2] . '" title="' . $match[2] . '">' . $caption . '</a>' . $match[3];
+  // Capture group 1 is the "www." address; group 2 is whatever the pattern's
+  // trailing punctuation group captured, and follows the link unchanged.
+  $address = decode_entities($match[1]);
+  // The caption is built from the decoded address, before it is escaped.
+  $caption = check_plain(_filter_url_trim($address));
+  $escaped = check_plain($address);
+  return '<a href="http://' . $escaped . '" title="' . $escaped . '">' . $caption . '</a>' . $match[2];
 }
 
 /**
Index: modules/filter/filter.test
===================================================================
RCS file: /cvs/drupal/drupal/modules/filter/filter.test,v
retrieving revision 1.6
diff -u -p -r1.6 filter.test
--- modules/filter/filter.test	15 Aug 2008 07:49:42 -0000	1.6
+++ modules/filter/filter.test	15 Aug 2008 08:59:55 -0000
@@ -206,6 +206,122 @@ class FilterTestCase extends DrupalWebTe
   }
 
   /**
+   * Test the URL filter
+   */
+  function testUrlFilter() {
+    $url_filter = 2;
+    // NOTE(review): 2 is presumably the id of the URL filter - confirm.
+    $format = $this->createFormat($url_filter);
+
+$body=<<<END
+Testing wwwstring with period at end www.example1.com. Testing email with period at end person@example2.com. Testing HTTP URL with period at end http://www.example3.com. Also test <code>using www.example4.com the code tag</code>.
+
+<blockquote>
+Test inside blockquote tag www.example5.com. email with person@example6.com. and url http://www.example7.com. And also <code>using www.example8.com the code tag and also inside <em>www.example9.com em tags</em> bla bla</code>.
+</blockquote>
+
+<code>One more simple code tag test? http://www.example10.com abc</code>
+
+Test the really simple cases next:
+
+http://www.example11.com
+www.example12.com
+person@example13.com
+<code>www.example14.com</code>
+
+What about tags that don't exist <x>like x say www.example15.com</x>? And what about tag <pooh>beginning www.example16.com with p?</pooh>
+
+Test &lt;br/&gt;: This is a www.example17.com. example <strong>with</strong> some http://www.example18.com various tags within the paragraph. *<br/> Also it is important www.example19.com to *<br/> test multiple different url's and wwwstrings http://www.example20.com urls in same paragraph. *<br/>I mean it www.example21.com many of them person@example22.com after each http://www.example23.com other *img*<img/> abc. This is just a www.example24.com paragraph with some http://www.example25.com urls thrown in. This is just a www.example26.com paragraph person@example27.com with some http://www.example28.com urls thrown in.
+
+<script>
+<!--
+  //Anything inside a javascript section should not be converted
+  exampleurl = "http://www.example29.com";
+-->
+</script>
+
+Again some simple tests inside various tags:
+
+<a href="foo">http://www.example30.com</a>
+<strong>http://www.example31.com</strong>
+<em>http://www.example32.com</em>
+
+And also example ftp URL ftp://ftp.example33.com.
+
+The old URL filter has problems with <a title="kind of link www.example41.com with text" href="http://www.example42.com">this kind of link</a> with www address as part of text in title. www.example43.com
+
+<dl>
+<dt>www.example44.com</dt>
+<dd>http://www.example45.com</dd>
+<dd>person@example46.com</dd>
+<dt>check www.example47.com</dt>
+<dd>this with some text around: http://www.example48.com not so easy person@example49.com now?</dd>
+</dl>
+
+<!-- This url www.example50.com is inside a comment -->
+
+hello.... there!
+END;
+
+    // Create a page node with the fixture as body, fetch it, and check the
+    // output: assertRaw() expects a generated link, assertNoRaw() expects none.
+    $edit = array();
+    $edit['title'] = $this->randomName();
+    $edit['body'] = $body;
+    $edit['format'] = $format->format;
+    $edit['type'] = 'page';
+    $page = $this->drupalCreateNode($edit);
+
+    $this->drupalGet('node/' . $page->nid);
+    $this->assertRaw('href="http://www.example1.com"', t('Parse simple www-string but not the end-of-sentence period.'));
+    $this->assertRaw('href="mailto:person@example2.com"', t('Parse simple email string but not the end-of-sentence period.'));
+    $this->assertRaw('href="http://www.example3.com"', t('Parse simple HTTP URL but not the end-of-sentence period.'));
+    $this->assertNoRaw('href="http://www.example4.com"', t('Do not parse simple HTTP URL inside code tags.'));
+    $this->assertRaw('href="http://www.example5.com"', t('Parse www-string inside blockquote tag.'));
+    $this->assertRaw('href="mailto:person@example6.com"', t('Parse email string inside blockquote tag.'));
+    $this->assertRaw('href="http://www.example7.com"', t('Parse HTTP URL inside blockquote tag'));
+    $this->assertNoRaw('href="http://www.example8.com"', t('Do not parse simple HTTP URL inside code tags.'));
+    $this->assertNoRaw('href="http://www.example9.com"', t('Do not parse simple HTTP URL inside em nested inside code tags.'));
+    $this->assertNoRaw('href="http://www.example10.com"', t('Do not parse simple HTTP URL inside code tags.'));
+    $this->assertRaw('href="http://www.example11.com"', t('Parse simple HTTP URL.'));
+    $this->assertRaw('href="http://www.example12.com"', t('Parse simple www-string.'));
+    $this->assertRaw('href="mailto:person@example13.com"', t('Parse simple email string.'));
+    $this->assertNoRaw('href="http://www.example14.com"', t('Do not parse simple HTTP URL inside code tags.'));
+    $this->assertRaw('href="http://www.example15.com"', t('Parse www-string inside tag not part of HTML spec ( <x> ).'));
+    $this->assertRaw('href="http://www.example16.com"', t('Parse www-string inside tag not part of HTML spec but beginning with p ( <pooh> ).'));
+    $this->assertRaw('href="http://www.example17.com"', t('Parse multiple www-strings inside same paragraph.'));
+    $this->assertRaw('href="http://www.example18.com"', t('Parse multiple www-strings inside same paragraph.'));
+    $this->assertRaw('href="http://www.example19.com"', t('Parse multiple www-strings inside same paragraph.'));
+    $this->assertRaw('href="http://www.example20.com"', t('Parse multiple www-strings inside same paragraph limited with <br>.'));
+    $this->assertRaw('href="http://www.example21.com"', t('Parse multiple www-strings inside same paragraph limited with <br>.'));
+    $this->assertRaw('href="mailto:person@example22.com"', t('Parse email string with multiple www-strings inside same paragraph limited with <br>.'));
+    $this->assertRaw('href="http://www.example23.com"', t('Parse multiple www-strings inside same paragraph limited with <br>.'));
+    $this->assertRaw('href="http://www.example24.com"', t('Parse multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertRaw('href="http://www.example25.com"', t('Parse multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertRaw('href="http://www.example26.com"', t('Parse multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertRaw('href="mailto:person@example27.com"', t('Parse email string with multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertRaw('href="http://www.example28.com"', t('Parse multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertNoRaw('href="http://www.example29.com"', t('Do not parse URL inside a script element (part of javascript code).'));
+    $this->assertNoRaw('href="http://www.example30.com"', t('Do not parse URL inside an a element.'));
+    $this->assertRaw('href="http://www.example31.com"', t('Parse URL inside strong tag.'));
+    $this->assertRaw('href="http://www.example32.com"', t('Parse URL inside em tag.'));
+    $this->assertRaw('href="ftp://ftp.example33.com"', t('Parse ftp:// URL.'));
+    $this->assertNoRaw('href="http://www.example41.com"', t('Do not parse www-strings inside an a element title attribute.'));
+    $this->assertNoRaw('<a href="http://www.example42.com"', t('Do not parse URL that is already the href attribute of a link.'));
+    $this->assertRaw('href="http://www.example44.com"', t('Parse www-string inside dl dt tags.'));
+    $this->assertRaw('href="http://www.example45.com"', t('Parse URL inside dl dd tags.'));
+    $this->assertRaw('href="mailto:person@example46.com"', t('Parse email string inside dl dd tags.'));
+    $this->assertRaw('href="http://www.example47.com"', t('Parse www-string with text inside dl dd tags.'));
+    $this->assertRaw('href="http://www.example48.com"', t('Parse URL with text inside dl dd tags.'));
+    $this->assertRaw('href="mailto:person@example49.com"', t('Parse email string with text inside dl dd tags.'));
+    $this->assertNoRaw('href="http://www.example50.com"', t('Do not parse URL that is inside HTML comment.'));
+    $this->assertRaw('hello.... there!', t('Verify that last part of normal text is preserved intact.'));
+
+
+    $this->deleteFormat($format);
+  }
+
+  /**
    * Test the line break filter
    */
   function testLineBreakFilter() {