diff --git a/core/modules/filter/filter.module b/core/modules/filter/filter.module
index 857a33b39285a0ceedaed3e5264d23ebd9fa23ab..d9813c7c6f069453fd1d7f37440840a69ab8e028 100644
--- a/core/modules/filter/filter.module
+++ b/core/modules/filter/filter.module
@@ -1086,34 +1086,49 @@ function _filter_url($text, $filter) {
$protocols = config('system.filter')->get('protocols');
$protocols = implode(':(?://)?|', $protocols) . ':(?://)?';
+ // The trail regex below is thanks to Twitter: https://github.com/twitter/twitter-text-js and https://github.com/stephenbeckett/TwitterURLMatchPHP
+
+ $valid_url_path_characters = "[\p{L}\p{M}\p{N}!\*\';:=\+,\.\$\/%#\[\]\-_~@&]";
+
+ // Allow URL paths to contain balanced parens
+ // 1. Used in Wikipedia URLs like /Primer_(film)
+ // 2. Used in IIS sessions like /S(dfd346)/
+ $valid_url_balanced_parens = '\('. $valid_url_path_characters . '+\)';
+
+ // Valid end-of-path characters (so /foo. does not gobble the period).
+ // 1. Allow = for empty URL parameters and other URL-join artifacts
+ $valid_url_ending_characters = '[\p{L}\p{M}\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')';
+
+ $valid_url_query_chars = '[a-z0-9!?\*\'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]';
+ $valid_url_query_ending_chars = '[a-z0-9_&=#\/]';
+
+ // Full path pattern.
+ // Also allow @ in a URL, but only in the middle, to catch things like http://example.com/@user/
+ $valid_url_path = '(?:(?:'.$valid_url_path_characters . '*(?:'.$valid_url_balanced_parens .$valid_url_path_characters . '*)*'. $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\/))';
+
// Prepare domain name pattern.
// The ICANN seems to be on track towards accepting more diverse top level
// domains, so this pattern has been "future-proofed" to allow for TLDs
// of length 2-64.
- $domain = '(?:[A-Za-z0-9._+-]+\.)?[A-Za-z]{2,64}\b';
+ $domain = '(?:[\p{L}\p{M}\p{N}._+-]+\.)?[\p{L}\p{M}]{2,64}\b';
$ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}';
- $auth = '[a-zA-Z0-9:%_+*~#?&=.,/;-]+@';
- $trail = '[a-zA-Z0-9:%_+*~#&\[\]=/;?!\.,-]*[a-zA-Z0-9:%_+*~#&\[\]=/;-]';
+ $auth = '[\p{L}\p{M}\p{N}:%_+*~#?&=.,/;-]+@';
+ $trail = '('.$valid_url_path.'*)?(\\?'.$valid_url_query_chars .'*'.$valid_url_query_ending_chars.')?';
- // Prepare pattern for optional trailing punctuation.
- // Even these characters could have a valid meaning for the URL, such usage is
- // rare compared to using a URL at the end of or within a sentence, so these
- // trailing characters are optionally excluded.
- $punctuation = '[\.,?!]*?';
// Match absolute URLs.
$url_pattern = "(?:$auth)?(?:$domain|$ip)/?(?:$trail)?";
- $pattern = "`((?:$protocols)(?:$url_pattern))($punctuation)`";
+ $pattern = "`((?:$protocols)(?:$url_pattern))`u";
$tasks['_filter_url_parse_full_links'] = $pattern;
// Match e-mail addresses.
- $url_pattern = "[A-Za-z0-9._-]{1,254}@(?:$domain)";
- $pattern = "`($url_pattern)`";
+ $url_pattern = "[\p{L}\p{M}\p{N}._-]{1,254}@(?:$domain)";
+ $pattern = "`($url_pattern)`u";
$tasks['_filter_url_parse_email_links'] = $pattern;
// Match www domains.
$url_pattern = "www\.(?:$domain)/?(?:$trail)?";
- $pattern = "`($url_pattern)($punctuation)`";
+ $pattern = "`($url_pattern)`u";
$tasks['_filter_url_parse_partial_links'] = $pattern;
// Each type of URL needs to be processed separately. The text is joined and
@@ -1188,7 +1203,7 @@ function _filter_url_parse_full_links($match) {
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
- return '' . $caption . '' . $match[$i + 1];
+ return '' . $caption . '';
}
/**
@@ -1218,7 +1233,7 @@ function _filter_url_parse_partial_links($match) {
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
- return '' . $caption . '' . $match[$i + 1];
+ return '' . $caption . '';
}
/**