diff --git a/modules/filter/filter.module b/modules/filter/filter.module
index e9fd01d..2749ce5 100644
--- a/modules/filter/filter.module
+++ b/modules/filter/filter.module
@@ -1364,7 +1364,7 @@ function _filter_html_tips($filter, $format, $long = FALSE) {
'tr' => NULL, 'td' => NULL, 'th' => NULL,
'del' => array(t('Deleted'), '' . t('Deleted') . ''),
'ins' => array(t('Inserted'), '' . t('Inserted') . ''),
- // Assumes and describes li.
+ // Assumes and describes li.
'ol' => array(t('Ordered list - use the <li> to begin each list item'), '
- ' . t('First item') . '
- ' . t('Second item') . '
'),
'ul' => array(t('Unordered list - use the <li> to begin each list item'), ' - ' . t('First item') . '
- ' . t('Second item') . '
'),
'li' => NULL,
@@ -1476,34 +1476,46 @@ function _filter_url($text, $filter) {
$protocols = variable_get('filter_allowed_protocols', array('ftp', 'http', 'https', 'irc', 'mailto', 'news', 'nntp', 'rtsp', 'sftp', 'ssh', 'tel', 'telnet', 'webcal'));
$protocols = implode(':(?://)?|', $protocols) . ':(?://)?';
+ $valid_url_path_characters = "[\p{L}\p{M}\p{N}!\*\';:=\+,\.\$\/%#\[\]\-_~@&]";
+
+ // Allow URL paths to contain balanced parens
+ // 1. Used in Wikipedia URLs like /Primer_(film)
+ // 2. Used in IIS sessions like /S(dfd346)/
+ $valid_url_balanced_parens = '\(' . $valid_url_path_characters . '+\)';
+
+ // Valid end-of-path characters (so /foo. does not gobble the period).
+ // 1. Allow = for empty URL parameters and other URL-join artifacts
+ $valid_url_ending_characters = '[\p{L}\p{M}\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')';
+
+ $valid_url_query_chars = '[a-zA-Z0-9!?\*\'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]';
+ $valid_url_query_ending_chars = '[a-zA-Z0-9_&=#\/]';
+
+ //full path
+ //and allow @ in a url, but only in the middle. Catch things like http://example.com/@user/
+ $valid_url_path = '(?:(?:' . $valid_url_path_characters . '*(?:' . $valid_url_balanced_parens . $valid_url_path_characters . '*)*' . $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\/))';
+
// Prepare domain name pattern.
// The ICANN seems to be on track towards accepting more diverse top level
// domains, so this pattern has been "future-proofed" to allow for TLDs
// of length 2-64.
- $domain = '(?:[A-Za-z0-9._+-]+\.)?[A-Za-z]{2,64}\b';
+ $domain = '(?:[\p{L}\p{M}\p{N}._+-]+\.)?[\p{L}\p{M}]{2,64}\b';
$ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}';
- $auth = '[a-zA-Z0-9:%_+*~#?&=.,/;-]+@';
- $trail = '[a-zA-Z0-9:%_+*~#&\[\]=/;?!\.,-]*[a-zA-Z0-9:%_+*~#&\[\]=/;-]';
-
- // Prepare pattern for optional trailing punctuation.
- // Even these characters could have a valid meaning for the URL, such usage is
- // rare compared to using a URL at the end of or within a sentence, so these
- // trailing characters are optionally excluded.
- $punctuation = '[\.,?!]*?';
+ $auth = '[\p{L}\p{M}\p{N}:%_+*~#?&=.,/;-]+@';
+ $trail = '(' . $valid_url_path . '*)?(\\?' . $valid_url_query_chars . '*' . $valid_url_query_ending_chars . ')?';
// Match absolute URLs.
$url_pattern = "(?:$auth)?(?:$domain|$ip)/?(?:$trail)?";
- $pattern = "`((?:$protocols)(?:$url_pattern))($punctuation)`";
+ $pattern = "`((?:$protocols)(?:$url_pattern))`u";
$tasks['_filter_url_parse_full_links'] = $pattern;
- // Match e-mail addresses.
- $url_pattern = "[A-Za-z0-9._+-]{1,254}@(?:$domain)";
- $pattern = "`($url_pattern)`";
+ // Match email addresses.
+ $url_pattern = "[\p{L}\p{M}\p{N}._+-]{1,254}@(?:$domain)";
+ $pattern = "`($url_pattern)`u";
$tasks['_filter_url_parse_email_links'] = $pattern;
// Match www domains.
$url_pattern = "www\.(?:$domain)/?(?:$trail)?";
- $pattern = "`($url_pattern)($punctuation)`";
+ $pattern = "`($url_pattern)`u";
$tasks['_filter_url_parse_partial_links'] = $pattern;
// Each type of URL needs to be processed separately. The text is joined and
@@ -1558,7 +1570,7 @@ function _filter_url($text, $filter) {
}
$text = implode($chunks);
- // Revert back to the original comment contents
+ // Revert to the original comment contents
_filter_url_escape_comments('', FALSE);
$text = preg_replace_callback('``', '_filter_url_escape_comments', $text);
}
@@ -1578,7 +1590,7 @@ function _filter_url_parse_full_links($match) {
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
- return '' . $caption . '' . $match[$i + 1];
+ return '' . $caption . '';
}
/**
@@ -1608,7 +1620,7 @@ function _filter_url_parse_partial_links($match) {
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
- return '' . $caption . '' . $match[$i + 1];
+ return '' . $caption . '';
}
/**
diff --git a/modules/filter/filter.test b/modules/filter/filter.test
index 34dcf04..1f0d98a 100644
--- a/modules/filter/filter.test
+++ b/modules/filter/filter.test
@@ -1325,6 +1325,7 @@ function testUrlFilter() {
http://trailingslash.com/ or www.trailingslash.com/
http://host.com/some/path?query=foo&bar[baz]=beer#fragment or www.host.com/some/path?query=foo&bar[baz]=beer#fragment
http://twitter.com/#!/example/status/22376963142324226
+http://example.com/@user/
ftp://user:pass@ftp.example.com/~home/dir1
sftp://user@nonstandardport:222/dir
ssh://192.168.0.100/srv/git/drupal.git
@@ -1334,10 +1335,29 @@ function testUrlFilter() {
'http://host.com/some/path?query=foo&bar[baz]=beer#fragment' => TRUE,
'www.host.com/some/path?query=foo&bar[baz]=beer#fragment' => TRUE,
'http://twitter.com/#!/example/status/22376963142324226' => TRUE,
+ 'http://example.com/@user/' => TRUE,
'ftp://user:pass@ftp.example.com/~home/dir1' => TRUE,
'sftp://user@nonstandardport:222/dir' => TRUE,
'ssh://192.168.0.100/srv/git/drupal.git' => TRUE,
),
+ // International Unicode characters.
+ '
+http://пример.испытание/
+http://مثال.إختبار/
+http://例子.測試/
+http://12345.中国/
+http://例え.テスト/
+http://dréißig-bücher.de/
+http://méxico-mañana.es/
+' => array(
+ 'http://пример.испытание/' => TRUE,
+ 'http://مثال.إختبار/' => TRUE,
+ 'http://例子.測試/' => TRUE,
+ 'http://12345.中国/' => TRUE,
+ 'http://例え.テスト/' => TRUE,
+ 'http://dréißig-bücher.de/' => TRUE,
+ 'http://méxico-mañana.es/' => TRUE,
+ ),
// Encoding.
'
http://ampersand.com/?a=1&b=2
@@ -1396,6 +1416,9 @@ function testUrlFilter() {
Partial URL with 3 trailing www.partial.periods...
E-mail with 3 trailing exclamations@example.com!!!
Absolute URL and query string with 2 different punctuation characters (http://www.example.com/q=abc).
+Partial URL with brackets in the URL as well as surrounded brackets (www.foo.com/more_(than)_one_(parens)).
+Absolute URL with square brackets in the URL as well as surrounded brackets [http://www.drupal.org/?class[]=1]
+Absolute URL with quotes "http://www.drupal.org/sample"
' => array(
'period www.partial.com.' => TRUE,
'comma person@example.com,' => TRUE,
@@ -1404,6 +1427,9 @@ function testUrlFilter() {
'trailing www.partial.periods...' => TRUE,
'trailing exclamations@example.com!!!' => TRUE,
'characters (http://www.example.com/q=abc).' => TRUE,
+ 'brackets (www.foo.com/more_(than)_one_(parens)).' => TRUE,
+ 'brackets [http://www.drupal.org/?class[]=1]' => TRUE,
+ 'quotes "http://www.drupal.org/sample"' => TRUE,
),
'
(www.parenthesis.com/dir?a=1&b=2#a)
diff --git a/modules/filter/tests/filter.url-input.txt b/modules/filter/tests/filter.url-input.txt
index 7b33af5..92289dc 100644
--- a/modules/filter/tests/filter.url-input.txt
+++ b/modules/filter/tests/filter.url-input.txt
@@ -10,6 +10,9 @@ http://www.test.com
www.test.com
person@test.com
www.test.com
+http://test.com/?search=test
+http://test.com/?search=Test
+http://test.com/?search=tesT
What about tags that don't exist like x say www.test.com? And what about tag beginning www.test.com with p?
diff --git a/modules/filter/tests/filter.url-output.txt b/modules/filter/tests/filter.url-output.txt
index 9cc5073..814a4ed 100644
--- a/modules/filter/tests/filter.url-output.txt
+++ b/modules/filter/tests/filter.url-output.txt
@@ -10,6 +10,9 @@ This is just a www.test.com. paragraph with www.test.com
person@test.com
www.test.com
+http://test.com/?search=test
+http://test.com/?search=Test
+http://test.com/?search=tesT
What about tags that don't exist like x say www.test.com? And what about tag beginning www.test.com with p?