/**
 * Verify the syntax of the given URL.
 *
 * This function should only be used on actual URLs. It should not be used for
 * Drupal menu paths, which can contain arbitrary characters.
 *
 * The URL is split into scheme, userinfo, host, port, path, query and fragment
 * with a regular expression modelled on the generic URI syntax, and each
 * component is then checked on its own. Only the syntax is verified; nothing
 * is resolved or requested.
 *
 * @param $url
 *   The URL to verify.
 * @param $options
 *   An associative array of additional options, with the following keys:
 *   - 'allow_relative' (default FALSE)
 *     Whether relative paths (URLs without a scheme) are allowed.
 *   - 'allow_ipv6' (default FALSE)
 *     Whether the host may be a bracket-enclosed IPv6 address, e.g.
 *     '[2001:0db8::1428:57ab]'. The text between the brackets is not
 *     validated as an address.
 *   - 'allow_idna' (default FALSE)
 *     Whether the host may be an IDNA hostname, i.e. contain non-ASCII
 *     characters. When TRUE, the host name characters are not checked at all.
 *   - 'allowed_schemes' (default array('%drupal'))
 *     Either an array of allowed schemes (in lowercase), e.g.
 *     array('http', 'https'), or FALSE to allow any scheme.
 *     If the array contains one or more of these pseudo-schemes (beginning
 *     with "%"), they are expanded to the following:
 *       '%drupal' => ('http', 'https') - protocols supported by
 *         drupal_http_request()
 *       '%browser' => ('http', 'https', 'gopher', 'ftp', 'data') - protocols
 *         natively supported by most browsers
 *       '%media' => ('mms', 'rtsp') - stream media protocols
 *       '%external' => ('news', 'nntp', 'telnet', 'mailto', 'irc', 'ssh',
 *         'sftp', 'feed', 'webcal') - protocols supported by non-browser
 *         applications
 *     The pseudo-scheme names themselves are never accepted as schemes.
 *   - 'forbidden_ports' (default - see the source code)
 *     Ports that are forbidden due to security issues.
 *
 * @return
 *   TRUE if the URL is in a valid format, FALSE otherwise.
 */
function valid_url($url, array $options = array()) {
  static $default_options = array(
    'allow_relative' => FALSE,
    'allow_ipv6' => FALSE,
    'allow_idna' => FALSE,
    'allowed_schemes' => array('%drupal'),
    // See http://www.mozilla.org/projects/netlib/PortBanning.html
    'forbidden_ports' => array(1, 7, 9, 11, 13, 15, 17, 19, 20, 22, 23, 25, 37, 42, 43, 53, 77, 79, 87, 95, 101, 102, 103, 104, 109, 110, 111, 113, 115, 117, 119, 123, 135, 139, 143, 179, 389, 465, 512, 513, 514, 515, 526, 530, 531, 532, 540, 556, 563, 587, 601, 636, 993, 995, 2049, 4045, 6000),
  );
  static $shortcuts = array(
    '%drupal' => array('http', 'https'),
    '%browser' => array('http', 'https', 'gopher', 'ftp', 'data'),
    '%media' => array('mms', 'rtsp'),
    '%external' => array('news', 'nntp', 'telnet', 'mailto', 'irc', 'ssh', 'sftp', 'feed', 'webcal'),
  );

  $options = array_merge($default_options, $options);

  // Split the URL into its components. The userinfo part must not contain
  // "/", "?" or "#"; otherwise an "@" appearing later in the path or query
  // would be mistaken for the userinfo delimiter and shift the host.
  $matched = preg_match('`^
      (?:([^:/?\#]+):)?         # scheme
      (?://
        ([^@/?\#]*@)?           # userinfo "@"
        (\[[^/?\#\]]*\]         # host (IPv6address/IPvFuture)
        |[^/?\#:]*)             # host (IPv4address/reg-name)
        (:[^/?\#:]*)?           # port
      )?
      ([^?\#]*)                 # path
      (\?[^\#]*)?               # "?" query
      (\#.*)?                   # "#" fragment
    `xs', $url, $reg);
  if (!$matched) {
    return FALSE;
  }

  // Trailing optional groups that did not take part in the match are missing
  // from $reg, so pad the result to a fixed shape.
  $reg += array_fill(0, 8, '');

  $scheme = $reg[1];
  $host = $reg[3];
  // Strip the delimiters that were captured together with the components.
  // The string casts cover PHP versions where substr() returns FALSE for an
  // empty result.
  $port = (string) substr($reg[4], 1);
  $parts = array(
    'userinfo' => (string) substr($reg[2], 0, -1),
    'host' => $host,
    'path' => $reg[5],
    'query' => (string) substr($reg[6], 1),
    'fragment' => (string) substr($reg[7], 1),
  );

  if ($scheme === '' && !$options['allow_relative']) {
    return FALSE;
  }

  if (substr($host, 0, 1) === '[') {
    // Bracketed IP literal: must be explicitly allowed and properly closed.
    if (!$options['allow_ipv6'] || substr($host, -1) !== ']') {
      return FALSE;
    }
    // TODO: Verify IPv6address - for now we just allow anything.
  }
  elseif (!$options['allow_idna']) {
    // Look for invalid characters, hyphens or periods first or last, hyphens
    // next to periods, or consecutive periods.
    if (preg_match('`[^0-9a-z.-]|^[-.]|[-.]$|\.-|-\.|\.\.`i', $host)) {
      return FALSE;
    }
  }
  // TODO: Verify IDNA hostname - for now we just allow anything.

  if ($options['allowed_schemes']) {
    // Expand the pseudo-schemes. Their "%name" placeholders are dropped from
    // the list so that they cannot be matched by a literal scheme.
    $allowed_schemes = array();
    foreach ((array) $options['allowed_schemes'] as $entry) {
      if (is_string($entry) && isset($shortcuts[$entry])) {
        $allowed_schemes = array_merge($allowed_schemes, $shortcuts[$entry]);
      }
      else {
        $allowed_schemes[] = $entry;
      }
    }
    if ($scheme !== '' && !in_array(strtolower($scheme), $allowed_schemes, TRUE)) {
      return FALSE;
    }
  }

  // Look for invalid characters.
  foreach (array('userinfo', 'path', 'query', 'fragment') as $name) {
    // According to RFC 3986 appendix A, the following characters are allowed:
    // * unreserved: ALPHA / DIGIT / "-" / "." / "_" / "~"
    // * pct-encoded: "%" HEXDIG HEXDIG
    // * sub-delims: "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
    // * ":" / "@" / "/" / "?"
    //
    // Not all the latter are allowed in all the mentioned parts, but the
    // invalid ones (e.g. "?" in the path) were already handled when the URL
    // was split above.
    //
    // We also allow the following that are commonly used: "[" "]"
    if (preg_match('`[^a-z0-9\-._~%!$&\'()*+,;=/?:@[\]]`i', $parts[$name])) {
      return FALSE;
    }
  }

  // Look for invalid percent encoding: every "%" must be followed by exactly
  // two hex digits, including at the very end of the component.
  foreach (array('userinfo', 'host', 'path') as $name) {
    if (preg_match('`%(?![0-9a-f]{2})`i', $parts[$name])) {
      return FALSE;
    }
  }

  // Make sure the port is an integer or empty (\z also rejects a trailing
  // newline), and not forbidden.
  if (!preg_match('`^\d*\z`', $port)) {
    return FALSE;
  }
  if ($port !== '' && in_array((int) $port, array_map('intval', (array) $options['forbidden_ports']), TRUE)) {
    return FALSE;
  }

  return TRUE;
}