Index: modules/filter/filter.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/filter/filter.module,v
retrieving revision 1.336
diff -u -p -r1.336 filter.module
--- modules/filter/filter.module	1 Aug 2010 19:50:33 -0000	1.336
+++ modules/filter/filter.module	16 Aug 2010 16:55:33 -0000
@@ -1291,46 +1291,145 @@ function _filter_url_settings($form, &$f
 }
 
 /**
- * URL filter. Automatically converts text web addresses (URLs, e-mail addresses,
- * ftp links, etc.) into hyperlinks.
+ * URL filter. Automatically converts text into hyperlinks.
+ *
+ * Three kinds of text are made clickable: URLs like http://example.com,
+ * e-mail addresses like name@example.com and web addresses without a
+ * protocol, like www.example.com. Each kind is processed in a separate pass,
+ * as no one regular expression could possibly match all of the cases.
+ *
+ * @param $text
+ *   The text in which to turn URLs and addresses into links.
+ * @param $filter
+ *   Filter object; its 'filter_url_length' setting goes to _filter_url_trim().
+ *
+ * @return
+ *   The text with links added outside of a, script, style, code and pre tags.
  */
 function _filter_url($text, $filter) {
+  // List of tags - the content of which must be skipped.
+  $ignore_tags = 'a|script|style|code|pre';
   // Pass length to regexp callback
   _filter_url_trim(NULL, $filter->settings['filter_url_length']);
 
-  $text = ' ' . $text . ' ';
+  // Create an array which contains the regexps for each type of link.
+  // The key to the regexp is the name of a function that is used as
+  // callback function to process matches of the regexp. The callback function
+  // is to return the replacement for the match. The array is used and
+  // matching/replacement done below inside some loops.
+  $tasks = array();
 
   // Match absolute URLs.
-  $text = preg_replace_callback("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '_filter_url_parse_full_links', $text);
+  $protocols = 'http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://';
+  $url_pattern = "(?:$protocols)(?:[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-])";
+  $replacement = "`($url_pattern)([\.\,\?\!]*?)`i";
+  $tasks['_filter_url_parse_full_links'] = $replacement;
 
   // Match e-mail addresses.
-  $text = preg_replace("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '\1<a href="mailto:\2">\2</a>\3', $text);
+  // Note: The ICANN seems to be on track towards accepting more diverse top
+  // level domains, so this pattern has been "future-proofed" to allow for TLD's
+  // of length 2-64.
+  $url_pattern = '[A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,64}';
+  $replacement = "`($url_pattern)`i";
+  $tasks['_filter_url_parse_email_links'] = $replacement;
 
   // Match www domains/addresses.
-  $text = preg_replace_callback("`(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '_filter_url_parse_partial_links', $text);
-  $text = substr($text, 1, -1);
+  $url_pattern = 'www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-]';
+  $replacement = "`($url_pattern)([\.\,\?\!]*?)`i";
+  $tasks['_filter_url_parse_partial_links'] = $replacement;
+
+  // We need to process each case of replacement type separately.
+  // The text must be joined and split again after each
+  // replacement, since replacements create new HTML tags and the new
+  // tags must be correctly protected before the next replacement can be done.
+  foreach ($tasks as $task => $replacement) {
+    // Split at all tags, so that nothing that is a tag name or attribute is
+    // processed. The "s" modifier lets "." match newlines, so tags and
+    // comments that span several lines are kept in one chunk as well.
+    $chunks = preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
+    // Note: PHP ensures the array consists of alternating delimiters and
+    // literals and begins and ends with a literal (inserting an empty string
+    // as required). Therefore, first chunk is always text:
+    $chunk_type = 'text';
+    // Tags to ignore are defined in $ignore_tags (see above).
+    // If an ignore_tag is found, it is stored here and removed only when the
+    // closing tag is found. Until the closing tag is found, no replacements are
+    // made.
+    $open_tag = '';
+
+    for ($i = 0; $i < count($chunks); $i++) {
+      if ($chunk_type == 'text') {
+        // Only do replacements when there are no unclosed ignore_tags.
+        if ($open_tag == '') {
+          // This is the high point of this function! If there is a match,
+          // a link is created in the callback function named by $task.
+          $chunks[$i] = preg_replace_callback($replacement, $task, $chunks[$i]);
+        }
+        // Done processing text chunk, so next chunk is a tag.
+        $chunk_type = 'tag';
+      }
+      else {
+        if ($open_tag == '') {
+          // No open ignore_tags. Process this tag...
+          if (preg_match("`<($ignore_tags)(?:\s|>)`i", $chunks[$i], $matches)) {
+            // Remember which of the $ignore_tags has just been opened.
+            $open_tag = $matches[1];
+          }
+        }
+        else {
+          // An ignored tag is open: nothing is replaced until its matching
+          // closing tag is found.
+          if (preg_match("`<\/$open_tag>`i", $chunks[$i], $matches)) {
+            $open_tag = '';
+          }
+        }
+        // Done processing tag chunk, so next chunk is text.
+        $chunk_type = 'text';
+      }
+    }
+    $text = implode($chunks);
+  }
 
   return $text;
 }
 
 /**
- * Make links out of absolute URLs.
+ * preg_replace_callback() callback: turns a matched absolute URL into a link.
  */
 function _filter_url_parse_full_links($match) {
-  $match[2] = decode_entities($match[2]);
-  $caption = check_plain(_filter_url_trim($match[2]));
-  $match[2] = check_url($match[2]);
-  return $match[1] . '<a href="' . $match[2] . '">' . $caption . '</a>' . $match[5];
+  // Group 1 of the regexp holds the URL, group 2 any trailing punctuation.
+  $url = decode_entities($match[1]);
+  // The caption is the URL run through _filter_url_trim() and check_plain().
+  $caption = check_plain(_filter_url_trim($url));
+  // The href gets the complete, untrimmed URL, passed through check_url().
+  $href = check_url($url);
+  return '<a href="' . $href . '">' . $caption . '</a>' . $match[2];
+}
+
+/**
+ * preg_replace callback to make links out of e-mail addresses.
+ */
+function _filter_url_parse_email_links($match) {
+  // The whole match (index 0) is the e-mail address.
+  $i = 0;
+  $match[$i] = decode_entities($match[$i]);
+  $caption = check_plain(_filter_url_trim($match[$i]));
+  $match[$i] = check_plain($match[$i]);
+  // A mailto URI has no "//" after the scheme: "mailto:", not "mailto://".
+  return '<a href="mailto:' . $match[$i] . '">' . $caption . '</a>';
 }
 
 /**
- * Make links out of domain names starting with "www."
+ * preg_replace_callback() callback: links addresses starting with "www."
  */
 function _filter_url_parse_partial_links($match) {
-  $match[2] = decode_entities($match[2]);
-  $caption = check_plain(_filter_url_trim($match[2]));
-  $match[2] = check_plain($match[2]);
-  return $match[1] . '<a href="http://' . $match[2] . '">' . $caption . '</a>' . $match[3];
+  // Group 1 is the "www." address, group 2 any trailing punctuation.
+  $address = decode_entities($match[1]);
+  // The caption is the address run through _filter_url_trim() and check_plain().
+  $caption = check_plain(_filter_url_trim($address));
+  // The match carries no protocol, so "http://" is prepended in the href.
+  $href = check_plain($address);
+  return '<a href="http://' . $href . '">' . $caption . '</a>' . $match[2];
 }
 
 /**
Index: modules/filter/filter.test
===================================================================
RCS file: /cvs/drupal/drupal/modules/filter/filter.test,v
retrieving revision 1.71
diff -u -p -r1.71 filter.test
--- modules/filter/filter.test	5 Aug 2010 23:53:38 -0000	1.71
+++ modules/filter/filter.test	16 Aug 2010 16:50:31 -0000
@@ -1012,10 +1012,10 @@ class FilterUnitTestCase extends DrupalU
 
     // Converting e-mail addresses.
     $f = _filter_url('johndoe@example.com', $filter);
-    $this->assertEqual($f, '<a href="mailto:johndoe@example.com">johndoe@example.com</a>', t('Converting e-mail addresses.'));
+    $this->assertEqual($f, '<a href="mailto:johndoe@example.com" title="johndoe@example.com">johndoe@example.com</a>', t('Converting e-mail addresses.'));
 
     $f = _filter_url('aaa@sub.tv', $filter);
-    $this->assertEqual($f, '<a href="mailto:aaa@sub.tv">aaa@sub.tv</a>', t('Converting e-mail addresses -- a short e-mail from Tuvalu.'));
+    $this->assertEqual($f, '<a href="mailto:aaa@sub.tv" title="aaa@sub.tv">aaa@sub.tv</a>', t('Converting e-mail addresses -- a short e-mail from Tuvalu.'));
 
     // URL trimming.
     $filter->settings['filter_url_length'] = 28;
@@ -1062,6 +1062,114 @@ class FilterUnitTestCase extends DrupalU
   }
 
   /**
+   * Test the URL filter on a longer text mixing links, tags and comments.
+   */
+  function testUrlFilterText() {
+
+    // Setup dummy filter object.
+    $filter = new stdClass;
+    $filter->settings = array(
+      'filter_url_length' => 496,
+    );
+
+    $text = <<<END
+Testing wwwstring with period at end www.example1.com. Testing email with period at end person@example2.com. Testing HTTP URL with period at end http://www.example3.com. Also test <code>using www.example4.com the code tag</code>.
+
+<blockquote>
+Test inside blockquote tag www.example5.com. email with person@example6.com. and url http://www.example7.com. And also <code>using www.example8.com the code tag and also inside <em>www.example9.com em tags</em> bla bla</code>.
+</blockquote>
+
+<code>One more simple code tag test? http://www.example10.com abc</code>
+
+Test the really simple cases next:
+
+http://www.example11.com
+www.example12.com
+person@example13.com
+<code>www.example14.com</code>
+
+What about tags that don't exist <x>like x say www.example15.com</x>? And what about tag <pooh>beginning www.example16.com with p?</pooh>
+
+Test &lt;br/&gt;: This is a www.example17.com. example <strong>with</strong> some http://www.example18.com various tags within the paragraph. *<br/> Also it is important www.example19.com to *<br/> test multiple different url's and wwwstrings http://www.example20.com urls in same paragraph. *<br/>I mean it www.example21.com many of them person@example22.com after each http://www.example23.com other *img*<img/> abc. This is just a www.example24.com paragraph with some http://www.example25.com urls thrown in. This is just a www.example26.com paragraph person@example27.com with some http://www.example28.com urls thrown in.
+
+<script>
+<!--
+  //Anything inside a javascript section should not be converted
+  exampleurl = "http://www.example29.com";
+-->
+</script>
+
+Again some simple tests inside various tags:
+
+<a href="foo">http://www.example30.com</a>
+<strong>http://www.example31.com</strong>
+<em>http://www.example32.com</em>
+
+And also example ftp URL ftp://ftp.example33.com.
+
+The old URL filter has problems with <a title="kind of link www.example41.com with text" href="http://www.example42.com">this kind of link</a> with www address as part of text in title. www.example43.com
+
+<dl>
+<dt>www.example44.com</dt>
+<dd>http://www.example45.com</dd>
+<dd>person@example46.com</dd>
+<dt>check www.example47.com</dt>
+<dd>this with some text around: http://www.example48.com not so easy person@example49.com now?</dd>
+</dl>
+
+<!-- This url www.example50.com is inside a comment -->
+
+hello.... there!
+END;
+
+    $filtered_text = _filter_url($text, $filter);
+    // strpos() returns offset 0 for a match at the start: compare with FALSE.
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example1.com"') !== FALSE, t('Parse simple www-string but not the end-of-sentence period.'));
+    $this->assertTrue(strpos($filtered_text, 'href="mailto:person@example2.com"') !== FALSE, t('Parse simple email string but not the end-of-sentence period.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example3.com"') !== FALSE, t('Parse simple HTTP URL but not the end-of-sentence period.'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example4.com"') !== FALSE, t('Do not parse simple HTTP URL inside code tags.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example5.com"') !== FALSE, t('Parse www-string inside blockquote tag.'));
+    $this->assertTrue(strpos($filtered_text, 'href="mailto:person@example6.com"') !== FALSE, t('Parse email string inside blockquote tag.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example7.com"') !== FALSE, t('Parse HTTP URL inside blockquote tag'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example8.com"') !== FALSE, t('Do not parse simple HTTP URL inside code tags.'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example9.com"') !== FALSE, t('Do not parse simple HTTP URL inside em nested inside code tags.'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example10.com"') !== FALSE, t('Do not parse simple HTTP URL inside code tags.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example11.com"') !== FALSE, t('Parse simple HTTP URL.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example12.com"') !== FALSE, t('Parse simple www-string.'));
+    $this->assertTrue(strpos($filtered_text, 'href="mailto:person@example13.com"') !== FALSE, t('Parse simple email string.'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example14.com"') !== FALSE, t('Do not parse simple HTTP URL inside code tags.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example15.com"') !== FALSE, t('Parse www-string inside tag not part of HTML spec ( <x> ).'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example16.com"') !== FALSE, t('Parse www-string inside tag not part of HTML spec but beginning with p ( <pooh> ).'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example17.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example18.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example19.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example20.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph limited with <br>.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example21.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph limited with <br>.'));
+    $this->assertTrue(strpos($filtered_text, 'href="mailto:person@example22.com"') !== FALSE, t('Parse email string with multiple www-strings inside same paragraph limited with <br>.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example23.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph limited with <br>.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example24.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example25.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example26.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertTrue(strpos($filtered_text, 'href="mailto:person@example27.com"') !== FALSE, t('Parse email string with multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example28.com"') !== FALSE, t('Parse multiple www-strings inside same paragraph limited with <br> and <img>.'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example29.com"') !== FALSE, t('Do not parse URL inside a script element (part of javascript code).'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example30.com"') !== FALSE, t('Do not parse URL inside an a element.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example31.com"') !== FALSE, t('Parse URL inside strong tag.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example32.com"') !== FALSE, t('Parse URL inside em tag.'));
+    $this->assertTrue(strpos($filtered_text, 'href="ftp://ftp.example33.com"') !== FALSE, t('Parse ftp:// URL.'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example41.com"') !== FALSE, t('Do not parse www-strings inside an a element title attribute.'));
+    $this->assertFalse(strpos($filtered_text, '<a href="http://www.example42.com"') !== FALSE, t('Do not parse URL that is already the href attribute of a link.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example44.com"') !== FALSE, t('Parse www-string inside dl dt tags.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example45.com"') !== FALSE, t('Parse URL inside dl dd tags.'));
+    $this->assertTrue(strpos($filtered_text, 'href="mailto:person@example46.com"') !== FALSE, t('Parse email string inside dl dd tags.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example47.com"') !== FALSE, t('Parse www-string with text inside dl dd tags.'));
+    $this->assertTrue(strpos($filtered_text, 'href="http://www.example48.com"') !== FALSE, t('Parse URL with text inside dl dd tags.'));
+    $this->assertTrue(strpos($filtered_text, 'href="mailto:person@example49.com"') !== FALSE, t('Parse email string with text inside dl dd tags.'));
+    $this->assertFalse(strpos($filtered_text, 'href="http://www.example50.com"') !== FALSE, t('Do not parse URL that is inside HTML comment.'));
+    $this->assertTrue(strpos($filtered_text, 'hello.... there!') !== FALSE, t('Verify that last part of normal text is preserved intact.'));
+  }
+
+  /**
    * Test the HTML corrector filter.
    *
    * @todo This test could really use some validity checking function.
