diff --git a/core/modules/filter/filter.module b/core/modules/filter/filter.module index 857a33b39285a0ceedaed3e5264d23ebd9fa23ab..d9813c7c6f069453fd1d7f37440840a69ab8e028 100644 --- a/core/modules/filter/filter.module +++ b/core/modules/filter/filter.module @@ -1086,34 +1086,49 @@ function _filter_url($text, $filter) { $protocols = config('system.filter')->get('protocols'); $protocols = implode(':(?://)?|', $protocols) . ':(?://)?'; + //trail regex below thanks to Twitter https://github.com/twitter/twitter-text-js and https://github.com/stephenbeckett/TwitterURLMatchPHP + + $valid_url_path_characters = "[\p{L}\p{M}\p{N}!\*\';:=\+,\.\$\/%#\[\]\-_~@&]"; + + // Allow URL paths to contain balanced parens + // 1. Used in Wikipedia URLs like /Primer_(film) + // 2. Used in IIS sessions like /S(dfd346)/ + $valid_url_balanced_parens = '\('. $valid_url_path_characters . '+\)'; + + // Valid end-of-path characters (so /foo. does not gobble the period). + // 1. Allow =&# for empty URL parameters and other URL-join artifacts + $valid_url_ending_characters = '[\p{L}\p{M}\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')'; + + $valid_url_query_chars = '[a-z0-9!?\*\'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]'; + $valid_url_query_ending_chars = '[a-z0-9_&=#\/]'; + + //full path + //and allow @ in a url, but only in the middle. Catch things like http://example.com/@user/ + $valid_url_path = '(?:(?:'.$valid_url_path_characters . '*(?:'.$valid_url_balanced_parens .$valid_url_path_characters . '*)*'. $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\/))'; + // Prepare domain name pattern. // The ICANN seems to be on track towards accepting more diverse top level // domains, so this pattern has been "future-proofed" to allow for TLDs // of length 2-64. 
- $domain = '(?:[A-Za-z0-9._+-]+\.)?[A-Za-z]{2,64}\b'; + $domain = '(?:[\p{L}\p{M}\p{N}._+-]+\.)?[\p{L}\p{M}]{2,64}\b'; $ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}'; - $auth = '[a-zA-Z0-9:%_+*~#?&=.,/;-]+@'; - $trail = '[a-zA-Z0-9:%_+*~#&\[\]=/;?!\.,-]*[a-zA-Z0-9:%_+*~#&\[\]=/;-]'; + $auth = '[\p{L}\p{M}\p{N}:%_+*~#?&=.,/;-]+@'; + $trail = '('.$valid_url_path.'*)?(\\?'.$valid_url_query_chars .'*'.$valid_url_query_ending_chars.')?'; - // Prepare pattern for optional trailing punctuation. - // Even these characters could have a valid meaning for the URL, such usage is - // rare compared to using a URL at the end of or within a sentence, so these - // trailing characters are optionally excluded. - $punctuation = '[\.,?!]*?'; // Match absolute URLs. $url_pattern = "(?:$auth)?(?:$domain|$ip)/?(?:$trail)?"; - $pattern = "`((?:$protocols)(?:$url_pattern))($punctuation)`"; + $pattern = "`((?:$protocols)(?:$url_pattern))`u"; $tasks['_filter_url_parse_full_links'] = $pattern; // Match e-mail addresses. - $url_pattern = "[A-Za-z0-9._-]{1,254}@(?:$domain)"; - $pattern = "`($url_pattern)`"; + $url_pattern = "[\p{L}\p{M}\p{N}._-]{1,254}@(?:$domain)"; + $pattern = "`($url_pattern)`u"; $tasks['_filter_url_parse_email_links'] = $pattern; // Match www domains. $url_pattern = "www\.(?:$domain)/?(?:$trail)?"; - $pattern = "`($url_pattern)($punctuation)`"; + $pattern = "`($url_pattern)`u"; $tasks['_filter_url_parse_partial_links'] = $pattern; // Each type of URL needs to be processed separately. The text is joined and @@ -1188,7 +1203,7 @@ function _filter_url_parse_full_links($match) { $match[$i] = decode_entities($match[$i]); $caption = check_plain(_filter_url_trim($match[$i])); $match[$i] = check_plain($match[$i]); - return '' . $caption . '' . $match[$i + 1]; + return '' . $caption . 
''; } /** @@ -1218,7 +1233,7 @@ function _filter_url_parse_partial_links($match) { $match[$i] = decode_entities($match[$i]); $caption = check_plain(_filter_url_trim($match[$i])); $match[$i] = check_plain($match[$i]); - return '' . $caption . '' . $match[$i + 1]; + return '' . $caption . ''; } /**