diff --git a/core/lib/Drupal/Component/Utility/String.php b/core/lib/Drupal/Component/Utility/String.php
index 970436c..761d1fe 100644
--- a/core/lib/Drupal/Component/Utility/String.php
+++ b/core/lib/Drupal/Component/Utility/String.php
@@ -50,6 +50,14 @@ public static function checkPlain($text) {
    *   The input $text, with all HTML entities decoded once.
    */
   public static function decodeEntities($text) {
+    // Numeric entities for code points that are not valid in HTML are turned
+    // into plain UTF-8 first; see decode(). Everything else is left for
+    // html_entity_decode() below.
+    $class = get_called_class();
+    $decoder = function ($matches) use ($class) {
+      return $class::decode($matches, $class);
+    };
+    $text = preg_replace_callback('/&#([0-9]+);|&#x([0-9a-f]+);/i', $decoder, $text);
     return html_entity_decode($text, ENT_QUOTES, 'UTF-8');
   }
 
@@ -132,5 +140,145 @@ public static function placeholder($text) {
     return SafeMarkup::set('' . static::checkPlain($text) . '');
   }
 
+  /**
+   * Decodes a numeric HTML entity for a code point that is not valid in HTML.
+   *
+   * Used as the preg_replace_callback() callback in decodeEntities(). A numeric
+   * entity whose code point is not allowed in an HTML5 document is converted
+   * to the UTF-8 bytes of that code point; every other entity is returned
+   * unchanged. ENT_HTML5 could be replaced with another document type flag
+   * that unicodeCpIsAllowed() understands.
+   *
+   * @param array $match
+   *   The regular expression match: $match[0] is the complete entity,
+   *   $match[1] holds the digits of a decimal entity and $match[2] the digits
+   *   of a hexadecimal entity.
+   * @param string $class
+   *   The called class. This method is called from an anonymous function which
+   *   breaks late static binding. See https://bugs.php.net/bug.php?id=66622 for
+   *   more information.
+   *
+   * @return string
+   *   The UTF-8 encoded character if the entity was decoded, otherwise the
+   *   entity exactly as it was matched.
+   */
+  protected static function decode($match, $class) {
+    $code = isset($match[2]) ? hexdec($match[2]) : intval($match[1]);
+    // Surrogates and values above U+10FFFF have no valid UTF-8 encoding, so
+    // such entities are left as they are. That also covers overlong digit
+    // strings, for which hexdec() returns a float and intval() saturates.
+    $encodable = $code > 0 && $code <= 0x10FFFF && ($code < 0xD800 || $code > 0xDFFF);
+    if ($encodable && !$class::unicodeCpIsAllowed($code, ENT_HTML5)) {
+      return $class::utf8($code);
+    }
+    return $match[0];
+  }
+
+  /**
+   * Encodes a Unicode code point as a UTF-8 byte sequence.
+   *
+   * @param int $k
+   *   The code point to encode.
+   *
+   * @return string
+   *   The one to four bytes that encode the code point, or U+FFFD REPLACEMENT
+   *   CHARACTER if $k is a surrogate or lies outside U+0000..U+10FFFF.
+   */
+  protected static function utf8($k) {
+    if ($k < 0 || $k > 0x10FFFF || ($k >= 0xD800 && $k <= 0xDFFF)) {
+      // Not a Unicode scalar value, so there is no valid UTF-8 form for it.
+      return "\xEF\xBF\xBD";
+    }
+    if ($k < 0x80) {
+      $bytes = array($k);
+    }
+    elseif ($k < 0x800) {
+      $bytes = array(0xC0 | ($k >> 6), 0x80 | ($k & 0x3F));
+    }
+    elseif ($k < 0x10000) {
+      $bytes = array(0xE0 | ($k >> 12), 0x80 | (($k >> 6) & 0x3F), 0x80 | ($k & 0x3F));
+    }
+    else {
+      $bytes = array(
+        0xF0 | ($k >> 18),
+        0x80 | (($k >> 12) & 0x3F),
+        0x80 | (($k >> 6) & 0x3F),
+        0x80 | ($k & 0x3F),
+      );
+    }
+    return implode('', array_map('chr', $bytes));
+  }
+
+  /**
+   * Checks whether a code point is allowed in a document of the given type.
+   *
+   * Allowed code point ranges:
+   * @code
+   * XML 1.0              HTML 4.01          HTML 5
+   * 0x09..0x0A           0x09..0x0A         0x09..0x0A
+   * 0x0D                 0x0D               0x0C..0x0D
+   * 0x0020..0xD7FF       0x20..0x7E         0x20..0x7E
+   *                      0x00A0..0xD7FF     0x00A0..0xD7FF
+   * 0xE000..0xFFFD       0xE000..0x10FFFF   0xE000..0xFDCF
+   * 0x010000..0x10FFFF                      0xFDF0..0x10FFFF (*)
+   * @endcode
+   *
+   * (*) exclude code points where ((code & 0xFFFF) >= 0xFFFE)
+   *
+   * References:
+   * - XML 1.0: the character range definition of the XML 1.0 specification.
+   * - HTML 4.01: the SGML declaration of HTML 4.01.
+   * - HTML 5: the preprocessing of the input stream in the HTML 5
+   *   specification.
+   *
+   * Not sure this is the relevant part for HTML 5, though. I opted to
+   * disallow the characters that would result in a parse error when
+   * preprocessing of the input stream. See also section 8.1.3.
+   *
+   * It's unclear if XHTML 1.0 allows C1 characters. I'll opt to apply to
+   * XHTML 1.0 the same rules as for XML 1.0.
+   *
+   * @todo The URLs of the references above are missing from this comment;
+   *   restore them.
+   *
+   * @param int $uni_cp
+   *   The code point to check.
+   * @param int $document_type
+   *   One of the ENT_HTML401, ENT_HTML5, ENT_XHTML or ENT_XML1 constants.
+   *
+   * @return bool
+   *   TRUE if the code point is allowed. Unknown document types allow every
+   *   code point.
+   */
+  protected static function unicodeCpIsAllowed($uni_cp, $document_type) {
+    switch ($document_type) {
+      case ENT_HTML401:
+        return ($uni_cp >= 0x20 && $uni_cp <= 0x7E) ||
+          ($uni_cp == 0x0A || $uni_cp == 0x09 || $uni_cp == 0x0D) ||
+          ($uni_cp >= 0xA0 && $uni_cp <= 0xD7FF) ||
+          ($uni_cp >= 0xE000 && $uni_cp <= 0x10FFFF);
+
+      case ENT_HTML5:
+        return ($uni_cp >= 0x20 && $uni_cp <= 0x7E) ||
+          // Form feed (U+000C) is allowed, vertical tab (U+000B) is not.
+          ($uni_cp >= 0x09 && $uni_cp <= 0x0D && $uni_cp != 0x0B) ||
+          ($uni_cp >= 0xA0 && $uni_cp <= 0xD7FF) ||
+          ($uni_cp >= 0xE000 && $uni_cp <= 0x10FFFF &&
+            // The last two code points of each plane are noncharacters.
+            (($uni_cp & 0xFFFF) < 0xFFFE) &&
+            // U+FDD0..U+FDEF are noncharacters as well.
+            ($uni_cp < 0xFDD0 || $uni_cp > 0xFDEF));
+
+      case ENT_XHTML:
+      case ENT_XML1:
+        return ($uni_cp >= 0x20 && $uni_cp <= 0xD7FF) ||
+          ($uni_cp == 0x0A || $uni_cp == 0x09 || $uni_cp == 0x0D) ||
+          ($uni_cp >= 0xE000 && $uni_cp <= 0x10FFFF && $uni_cp != 0xFFFE && $uni_cp != 0xFFFF);
+
+      default:
+        // Unknown document types do not restrict the code point.
+        return TRUE;
+    }
+  }
 
 }
diff --git a/core/modules/editor/tests/src/Unit/EditorXssFilter/StandardTest.php b/core/modules/editor/tests/src/Unit/EditorXssFilter/StandardTest.php
index 29209b6..556dbf2 100644
--- a/core/modules/editor/tests/src/Unit/EditorXssFilter/StandardTest.php
+++ b/core/modules/editor/tests/src/Unit/EditorXssFilter/StandardTest.php
@@ -147,9 +147,5 @@ public function providerTestFilterXss() {
     // @see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#Spaces_and_meta_chars_before_the_JavaScript_in_images_for_XSS
-    // @fixme This dataset currently fails under 5.4 because of
-    //   https://drupal.org/node/1210798. Restore after it's fixed.
-    if (version_compare(PHP_VERSION, '5.4.0', '<')) {
-      $data[] = array('', '');
-    }
+    $data[] = array('', '');
 
     // Non-alpha-non-digit XSS.
     // @see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#Non-alpha-non-digit_XSS