diff --git a/core/includes/common.inc b/core/includes/common.inc index 88d6fd3..515a2fc 100644 --- a/core/includes/common.inc +++ b/core/includes/common.inc @@ -11,6 +11,7 @@ use Drupal\Core\Database\Database; use Drupal\Core\SystemListingInfo; use Drupal\Core\Template\Attribute; +use Drupal\Component\Utility\Unicode; /** * @file @@ -1548,11 +1549,7 @@ function filter_xss_bad_protocol($string, $decode = TRUE) { // @todo Remove the $decode parameter in Drupal 8, and always assume an HTML // string that needs decoding. if ($decode) { - if (!function_exists('decode_entities')) { - require_once DRUPAL_ROOT . '/core/includes/unicode.inc'; - } - - $string = decode_entities($string); + $string = Unicode::decodeEntities($string); } return check_plain(drupal_strip_dangerous_protocols($string)); } @@ -1582,7 +1579,7 @@ function format_rss_channel($title, $link, $description, $items, $langcode = NUL // The RSS 2.0 "spec" doesn't indicate HTML can be used in the description. // We strip all HTML tags, but need to prevent double encoding from properly // escaped source data (such as & becoming &amp;). - $output .= ' ' . check_plain(decode_entities(strip_tags($description))) . "\n"; + $output .= ' ' . check_plain(Unicode::decodeEntities(strip_tags($description))) . "\n"; $output .= ' ' . check_plain($langcode) . "\n"; $output .= format_xml_elements($args); $output .= $items; @@ -3414,7 +3411,7 @@ function drupal_clean_css_identifier($identifier, $filter = array(' ' => '-', '_ * The cleaned class name. 
*/ function drupal_html_class($class) { - return drupal_clean_css_identifier(drupal_strtolower($class)); + return drupal_clean_css_identifier(Unicode::strtolower($class)); } /** @@ -3488,7 +3485,7 @@ function drupal_html_id($id) { } $seen_ids = &drupal_static(__FUNCTION__, $seen_ids_init); - $id = strtr(drupal_strtolower($id), array(' ' => '-', '_' => '-', '[' => '-', ']' => '')); + $id = strtr(Unicode::strtolower($id), array(' ' => '-', '_' => '-', '[' => '-', ']' => '')); // As defined in http://www.w3.org/TR/html4/types.html#type-name, HTML IDs can // only contain letters, digits ([0-9]), hyphens ("-"), underscores ("_"), @@ -4866,7 +4863,6 @@ function _drupal_bootstrap_code() { require_once DRUPAL_ROOT . '/' . settings()->get('menu_inc', 'core/includes/menu.inc'); require_once DRUPAL_ROOT . '/core/includes/tablesort.inc'; require_once DRUPAL_ROOT . '/core/includes/file.inc'; - require_once DRUPAL_ROOT . '/core/includes/unicode.inc'; require_once DRUPAL_ROOT . '/core/includes/image.inc'; require_once DRUPAL_ROOT . '/core/includes/form.inc'; require_once DRUPAL_ROOT . '/core/includes/mail.inc'; diff --git a/core/includes/file.inc b/core/includes/file.inc index 5f54190..ad9c062 100644 --- a/core/includes/file.inc +++ b/core/includes/file.inc @@ -6,6 +6,7 @@ */ use Drupal\Core\StreamWrapper\LocalStream; +use Drupal\Component\Utility\Unicode; use Drupal\Component\PhpStorage\MTimeProtectedFastFileStorage; use Symfony\Component\HttpKernel\Exception\AccessDeniedHttpException; use Symfony\Component\HttpKernel\Exception\NotFoundHttpException; @@ -445,7 +446,7 @@ function file_create_url($uri) { // HTTP and to https://example.com/bar.jpg when viewing a HTTPS page) // Both types of relative URIs are characterized by a leading slash, hence // we can use a single check. 
- if (drupal_substr($uri, 0, 1) == '/') { + if (Unicode::substr($uri, 0, 1) == '/') { return $uri; } else { diff --git a/core/includes/form.inc b/core/includes/form.inc index 3ebaca1..69ef376 100644 --- a/core/includes/form.inc +++ b/core/includes/form.inc @@ -6,6 +6,7 @@ */ use Drupal\Component\Utility\NestedArray; +use Drupal\Component\Utility\Unicode; use Drupal\Core\Form\FormInterface; use Drupal\Core\Database\Database; use Drupal\Core\Template\Attribute; @@ -1366,8 +1367,8 @@ function _form_validate(&$elements, &$form_state, $form_id = NULL) { // The following errors are always shown. if (isset($elements['#needs_validation'])) { // Verify that the value is not longer than #maxlength. - if (isset($elements['#maxlength']) && drupal_strlen($elements['#value']) > $elements['#maxlength']) { - form_error($elements, $t('!name cannot be longer than %max characters but is currently %length characters long.', array('!name' => empty($elements['#title']) ? $elements['#parents'][0] : $elements['#title'], '%max' => $elements['#maxlength'], '%length' => drupal_strlen($elements['#value'])))); + if (isset($elements['#maxlength']) && Unicode::strlen($elements['#value']) > $elements['#maxlength']) { + form_error($elements, $t('!name cannot be longer than %max characters but is currently %length characters long.', array('!name' => empty($elements['#title']) ? $elements['#parents'][0] : $elements['#title'], '%max' => $elements['#maxlength'], '%length' => Unicode::strlen($elements['#value'])))); } if (isset($elements['#options']) && isset($elements['#value'])) { @@ -1447,7 +1448,7 @@ function _form_validate(&$elements, &$form_state, $form_id = NULL) { // An unchecked checkbox has a #value of integer 0, different than string // '0', which could be a valid value. 
$is_empty_multiple = (!count($elements['#value'])); - $is_empty_string = (is_string($elements['#value']) && drupal_strlen(trim($elements['#value'])) == 0); + $is_empty_string = (is_string($elements['#value']) && Unicode::strlen(trim($elements['#value'])) == 0); $is_empty_value = ($elements['#value'] === 0); if ($is_empty_multiple || $is_empty_string || $is_empty_value) { // Flag this element as #required_but_empty to allow #element_validate diff --git a/core/includes/install.inc b/core/includes/install.inc index 3fdc48c..258f971 100644 --- a/core/includes/install.inc +++ b/core/includes/install.inc @@ -5,6 +5,7 @@ * API functions for installing modules and themes. */ +use Drupal\Component\Utility\Unicode; use Drupal\Core\Database\Database; use Drupal\Core\DrupalKernel; use Drupal\locale\Gettext; @@ -594,7 +595,7 @@ function drupal_verify_profile($install_state) { if (count($missing_modules)) { $modules = array(); foreach ($missing_modules as $module) { - $modules[] = '' . drupal_ucfirst($module) . ''; + $modules[] = '' . Unicode::ucfirst($module) . ''; } $requirements['required_modules'] = array( 'title' => st('Required modules'), diff --git a/core/includes/mail.inc b/core/includes/mail.inc index c9ff601..1147625 100644 --- a/core/includes/mail.inc +++ b/core/includes/mail.inc @@ -5,6 +5,8 @@ * API functions for processing and sending e-mail. */ +use Drupal\Component\Utility\Unicode; + /** * Auto-detect appropriate line endings for e-mails. 
* @@ -463,11 +465,11 @@ function drupal_html_to_text($string, $allowed_tags = NULL) { // Fancy headers case 'h1': $indent[] = '======== '; - $casing = 'drupal_strtoupper'; + $casing = 'Drupal\Component\Utility\Unicode::strtoupper'; break; case 'h2': $indent[] = '-------- '; - $casing = 'drupal_strtoupper'; + $casing = 'Drupal\Component\Utility\Unicode::strtoupper'; break; case '/h1': case '/h2': @@ -496,8 +498,8 @@ function drupal_html_to_text($string, $allowed_tags = NULL) { else { // Convert inline HTML text to plain text; not removing line-breaks or // white-space, since that breaks newlines when sanitizing plain-text. - $value = trim(decode_entities($value)); - if (drupal_strlen($value)) { + $value = trim(Unicode::decodeEntities($value)); + if (Unicode::strlen($value)) { $chunk = $value; } } diff --git a/core/includes/menu.inc b/core/includes/menu.inc index d719571..13fd258 100644 --- a/core/includes/menu.inc +++ b/core/includes/menu.inc @@ -8,6 +8,7 @@ use Symfony\Component\HttpFoundation\Request; use Drupal\Component\Utility\NestedArray; +use Drupal\Component\Utility\Unicode; use Drupal\Core\Cache\CacheBackendInterface; use Drupal\Core\Template\Attribute; use Drupal\menu_link\Plugin\Core\Entity\MenuLink; @@ -2141,11 +2142,11 @@ function menu_contextual_links($module, $parent_path, $args) { ->execute() ->fetchAllAssoc('path', PDO::FETCH_ASSOC); } - $parent_length = drupal_strlen($root_path) + 1; + $parent_length = Unicode::strlen($root_path) + 1; $map = $router_item['original_map']; foreach ($data[$root_path] as $item) { // Extract the actual "task" string from the path argument. - $key = drupal_substr($item['path'], $parent_length); + $key = Unicode::substr($item['path'], $parent_length); // Denormalize and translate the contextual link. 
_menu_translate($item, $map, TRUE); diff --git a/core/includes/theme.maintenance.inc b/core/includes/theme.maintenance.inc index 37775e6..a9b7226 100644 --- a/core/includes/theme.maintenance.inc +++ b/core/includes/theme.maintenance.inc @@ -25,7 +25,6 @@ function _drupal_maintenance_theme() { require_once DRUPAL_ROOT . '/' . settings()->get('path_inc', 'core/includes/path.inc'); require_once DRUPAL_ROOT . '/core/includes/theme.inc'; require_once DRUPAL_ROOT . '/core/includes/common.inc'; - require_once DRUPAL_ROOT . '/core/includes/unicode.inc'; require_once DRUPAL_ROOT . '/core/includes/file.inc'; require_once DRUPAL_ROOT . '/core/includes/module.inc'; unicode_check(); diff --git a/core/lib/Drupal/Component/Diff/DiffEngine.php b/core/lib/Drupal/Component/Diff/DiffEngine.php index f426b96..fd4d63b 100644 --- a/core/lib/Drupal/Component/Diff/DiffEngine.php +++ b/core/lib/Drupal/Component/Diff/DiffEngine.php @@ -8,6 +8,8 @@ * You may copy this code freely under the conditions of the GPL. */ +use Drupal\Component\Utility\Unicode; + define('USE_ASSERTS', FALSE); /** @@ -238,7 +240,7 @@ function diff($from_lines, $to_lines) { * Returns the whole line if it's small enough, or the MD5 hash otherwise. 
*/ function _line_hash($line) { - if (drupal_strlen($line) > $this->MAX_XREF_LENGTH()) { + if (Unicode::strlen($line) > $this->MAX_XREF_LENGTH()) { return md5($line); } else { @@ -993,7 +995,7 @@ function addWords($words, $tag = '') { } if ($word[0] == "\n") { $this->_flushLine($tag); - $word = drupal_substr($word, 1); + $word = Unicode::substr($word, 1); } assert(!strstr($word, "\n")); $this->_group .= $word; @@ -1037,7 +1039,7 @@ function _split($lines) { $words[] = "\n"; $stripped[] = "\n"; } - if ( drupal_strlen( $line ) > $this->MAX_LINE_LENGTH() ) { + if ( Unicode::strlen( $line ) > $this->MAX_LINE_LENGTH() ) { $words[] = $line; $stripped[] = $line; } @@ -1256,7 +1258,7 @@ function render() { break; case 'delete': foreach ($chunk->orig as $i => $piece) { - if (strpos($piece, '<') === 0 && drupal_substr($piece, drupal_strlen($piece) - 1) === '>') { + if (strpos($piece, '<') === 0 && Unicode::substr($piece, Unicode::strlen($piece) - 1) === '>') { $output .= $piece; } else { @@ -1267,7 +1269,7 @@ function render() { default: $chunk->closing = $this->process_chunk($chunk->closing); foreach ($chunk->closing as $i => $piece) { - if ($piece === ' ' || (strpos($piece, '<') === 0 && drupal_substr($piece, drupal_strlen($piece) - 1) === '>' && drupal_strtolower(drupal_substr($piece, 1, 3)) != 'img')) { + if ($piece === ' ' || (strpos($piece, '<') === 0 && Unicode::substr($piece, Unicode::strlen($piece) - 1) === '>' && Unicode::strtolower(Unicode::substr($piece, 1, 3)) != 'img')) { $output .= $piece; } else { @@ -1291,11 +1293,11 @@ function process_chunk($chunk) { if (!isset($processed[$j])) { $processed[$j] = ''; } - if (strpos($piece, '<') === 0 && drupal_substr($piece, drupal_strlen($piece) - 1) === '>') { + if (strpos($piece, '<') === 0 && Unicode::substr($piece, Unicode::strlen($piece) - 1) === '>') { $processed[$j] = $piece; $j++; } - elseif (isset($next) && strpos($next, '<') === 0 && drupal_substr($next, drupal_strlen($next) - 1) === '>') { + elseif 
(isset($next) && strpos($next, '<') === 0 && Unicode::substr($next, Unicode::strlen($next) - 1) === '>') { $processed[$j] .= $piece; $j++; } diff --git a/core/lib/Drupal/Component/Utility/Unicode.php b/core/lib/Drupal/Component/Utility/Unicode.php new file mode 100644 index 0000000..3e241ad --- /dev/null +++ b/core/lib/Drupal/Component/Utility/Unicode.php @@ -0,0 +1,644 @@ + $t('Standard PHP'), + UNICODE_MULTIBYTE => $t('PHP Mbstring Extension'), + UNICODE_ERROR => $t('Error'), + ); + $severities = array( + UNICODE_SINGLEBYTE => REQUIREMENT_WARNING, + UNICODE_MULTIBYTE => NULL, + UNICODE_ERROR => REQUIREMENT_ERROR, + ); + $failed_check = unicode_check(); + $library = $GLOBALS['multibyte']; + + $requirements['unicode'] = array( + 'title' => $t('Unicode library'), + 'value' => $libraries[$library], + 'severity' => $severities[$library], + ); + $t_args = array('@url' => 'http://www.php.net/mbstring'); + switch ($failed_check) { + case 'mb_strlen': + $requirements['unicode']['description'] = $t('Operations on Unicode strings are emulated on a best-effort basis. Install the PHP mbstring extension for improved Unicode support.', $t_args); + break; + + case 'mbstring.func_overload': + $requirements['unicode']['description'] = $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini mbstring.func_overload setting. Please refer to the PHP mbstring documentation for more information.', $t_args); + break; + + case 'mbstring.encoding_translation': + $requirements['unicode']['description'] = $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini mbstring.encoding_translation setting. Please refer to the PHP mbstring documentation for more information.', $t_args); + break; + + case 'mbstring.http_input': + $requirements['unicode']['description'] = $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini mbstring.http_input setting. 
Please refer to the PHP mbstring documentation for more information.', $t_args); + break; + + case 'mbstring.http_output': + $requirements['unicode']['description'] = $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini mbstring.http_output setting. Please refer to the PHP mbstring documentation for more information.', $t_args); + break; + } + + return $requirements; + } + + /** + * Prepares a new XML parser. + * + * This is a wrapper around xml_parser_create() which extracts the encoding + * from the XML data first and sets the output encoding to UTF-8. This function + * should be used instead of xml_parser_create(), because PHP 4's XML parser + * doesn't check the input encoding itself. "Starting from PHP 5, the input + * encoding is automatically detected, so that the encoding parameter specifies + * only the output encoding." + * + * This is also where unsupported encodings will be converted. Callers should + * take this into account: $data might have been changed after the call. + * + * @param string $data + * The XML data which will be parsed later. + * + * @return resource + * An XML parser object or FALSE on error. + * + * @ingroup php_wrappers + */ + public static function createXMLParser(&$data) { + // Default XML encoding is UTF-8 + $encoding = 'utf-8'; + $bom = FALSE; + + // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it). + if (!strncmp($data, "\xEF\xBB\xBF", 3)) { + $bom = TRUE; + $data = substr($data, 3); + } + + // Check for an encoding declaration in the XML prolog if no BOM was found. + // The declared name may be wrapped in either double or single quotes. + if (!$bom && preg_match('/^<\?xml[^>]+encoding=(["\'])(.+?)\1/', $data, $match)) { + $encoding = $match[2]; + } + + // Unsupported encodings are converted here into UTF-8. 
+ $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii'); + if (!in_array(strtolower($encoding), $php_supported, TRUE)) { + $out = self::convertToUTF8($data, $encoding); + if ($out !== FALSE) { + $encoding = 'utf-8'; + $data = preg_replace('/^(<\?xml[^>]+encoding)=(["\'])(.+?)\2/', '\\1="utf-8"', $out); + } + else { + watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING); + return FALSE; + } + } + + $xml_parser = xml_parser_create($encoding); + xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8'); + return $xml_parser; + } + + /** + * Converts data to UTF-8. + * + * Requires the iconv, GNU recode or mbstring PHP extension. + * + * @param string $data + * The data to be converted. + * @param string $encoding + * The encoding that the data is in. + * + * @return bool|string + * Converted data or FALSE. + */ + public static function convertToUTF8($data, $encoding) { + if (function_exists('iconv')) { + $out = @iconv($encoding, 'utf-8', $data); + } + elseif (function_exists('mb_convert_encoding')) { + $out = @mb_convert_encoding($data, 'utf-8', $encoding); + } + elseif (function_exists('recode_string')) { + $out = @recode_string($encoding . '..utf-8', $data); + } + else { + watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR); + return FALSE; + } + + return $out; + } + + /** + * Truncates a UTF-8-encoded string safely to a number of bytes. + * + * If the end position is in the middle of a UTF-8 sequence, it scans backwards + * until the beginning of the byte sequence. + * + * Use this function whenever you want to chop off a string at an unsure + * location. On the other hand, if you're sure that you're splitting on a + * character boundary (e.g. after using strpos() or similar), you can safely + * use substr() instead. + * + * @param string $string + * The string to truncate. 
+ * @param int $len + * An upper limit on the returned string length. + * + * @return string + * The truncated string. + */ + public static function truncateBytes($string, $len) { + if (strlen($string) <= $len) { + return $string; + } + if ((ord($string[$len]) < 0x80) || (ord($string[$len]) >= 0xC0)) { + return substr($string, 0, $len); + } + // Scan backwards to beginning of the byte sequence. + while (--$len >= 0 && ord($string[$len]) >= 0x80 && ord($string[$len]) < 0xC0); + + // On malformed input the scan can run off the start and leave $len at -1, + // which substr() would treat as a length counted from the end. + return substr($string, 0, max($len, 0)); + } + + /** + * Truncates a UTF-8-encoded string safely to a number of characters. + * + * @param string $string + * The string to truncate. + * @param int $max_length + * An upper limit on the returned string length, including trailing ellipsis + * if $add_ellipsis is TRUE. + * @param bool $wordsafe + * If TRUE, attempt to truncate on a word boundary. Word boundaries are + * spaces, punctuation, and Unicode characters used as word boundaries in + * non-Latin languages; see self::PREG_CLASS_UNICODE_WORD_BOUNDARY for more + * information. If a word boundary cannot be found that would make the length + * of the returned string fall within length guidelines (see parameters + * $max_length and $min_wordsafe_length), word boundaries are ignored. + * @param bool $add_ellipsis + * If TRUE, add t('...') to the end of the truncated string (defaults to + * FALSE). The string length will still fall within $max_length. + * @param int $min_wordsafe_length + * If $wordsafe is TRUE, the minimum acceptable length for truncation (before + * adding an ellipsis, if $add_ellipsis is TRUE). Has no effect if $wordsafe + * is FALSE. This can be used to prevent having a very short resulting string + * that will not be understandable. 
For instance, if you are truncating the + * string "See myverylongurlexample.com for more information" to a word-safe + * return length of 20, the only available word boundary within 20 characters + * is after the word "See", which wouldn't leave a very informative string. If + * you had set $min_wordsafe_length to 10, though, the function would realise + * that "See" alone is too short, and would then just truncate ignoring word + * boundaries, giving you "See myverylongurl..." (assuming you had set + * $add_ellipsis to TRUE). + * + * @return string + * The truncated string. + */ + public static function truncateUTF8($string, $max_length, $wordsafe = FALSE, $add_ellipsis = FALSE, $min_wordsafe_length = 1) { + $ellipsis = ''; + $max_length = max($max_length, 0); + $min_wordsafe_length = max($min_wordsafe_length, 0); + + if (self::strlen($string) <= $max_length) { + // No truncation needed, so don't add ellipsis, just return. + return $string; + } + + if ($add_ellipsis) { + // Truncate ellipsis in case $max_length is small. + $ellipsis = self::substr(t('…'), 0, $max_length); + $max_length -= self::strlen($ellipsis); + $max_length = max($max_length, 0); + } + + if ($max_length <= $min_wordsafe_length) { + // Do not attempt word-safe if lengths are bad. + $wordsafe = FALSE; + } + + if ($wordsafe) { + $matches = array(); + // Find the last word boundary, if there is one within $min_wordsafe_length + // to $max_length characters. preg_match() is always greedy, so it will + // find the longest string possible. + $found = preg_match('/^(.{' . $min_wordsafe_length . ',' . $max_length . '})[' . self::PREG_CLASS_UNICODE_WORD_BOUNDARY . ']/u', $string, $matches); + if ($found) { + $string = $matches[1]; + } + else { + $string = self::substr($string, 0, $max_length); + } + } + else { + $string = self::substr($string, 0, $max_length); + } + + if ($add_ellipsis) { + // If we're adding an ellipsis, remove any trailing periods. 
+ $string = rtrim($string, '.'); + + $string .= $ellipsis; + } + + return $string; + } + + /** + * Encodes MIME/HTTP header values that contain incorrectly encoded characters. + * + * For example, self::MIMEHeaderEncode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=". + * + * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information. + * + * Notes: + * - Only encode strings that contain non-ASCII characters. + * - We progressively cut-off a chunk with self::truncateBytes(). This is to + * ensure each chunk starts and ends on a character boundary. + * - Using \n as the chunk separator may cause problems on some systems and may + * have to be changed to \r\n or \r. + * + * @param string $string + * The header to encode. + * + * @return string + * The mime-encoded header. + * + * @see self::MIMEHeaderDecode() + */ + public static function MIMEHeaderEncode($string) { + if (preg_match('/[^\x20-\x7E]/', $string)) { + $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75); + $len = strlen($string); + $output = ''; + while ($len > 0) { + $chunk = self::truncateBytes($string, $chunk_size); + // Malformed UTF-8 can leave no character boundary inside the chunk, in + // which case nothing is cut off; fall back to a raw byte cut so that + // every iteration consumes input and the loop terminates. + if ($chunk === '') { + $chunk = substr($string, 0, $chunk_size); + } + $output .= ' =?UTF-8?B?' . base64_encode($chunk) . "?=\n"; + $c = strlen($chunk); + $string = substr($string, $c); + $len -= $c; + } + return trim($output); + } + return $string; + } + + /** + * Decodes MIME/HTTP encoded header values. + * + * @param string $header + * The header to decode. + * + * @return string + * The mime-decoded header. 
+ * + * @see self::MIMEHeaderEncode() + */ + public static function MIMEHeaderDecode($header) { + // First step: encoded chunks followed by other encoded chunks (need to collapse whitespace) + $header = preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/', array(__CLASS__, 'MIMEHeaderDecodeCallback'), $header); + // Second step: remaining chunks (do not collapse whitespace) + return preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/', array(__CLASS__, 'MIMEHeaderDecodeCallback'), $header); + } + + /** + * Decodes encoded header data passed from self::MIMEHeaderDecode(). + * + * Callback for preg_replace_callback() within self::MIMEHeaderDecode(). + * + * @param array $matches + * The array of matches from preg_replace_callback(). + * + * @return string + * The mime-decoded string. + * + * @see self::MIMEHeaderDecode() + */ + protected static function MIMEHeaderDecodeCallback(array $matches) { + // Regexp groups: + // 1: Character set name + // 2: Escaping method (Q or B) + // 3: Encoded data + // In Q-encoding "_" stands for a space. It is swapped before decoding so + // that an encoded literal underscore (=5F) is not turned into a space. + $data = ($matches[2] == 'B') ? base64_decode($matches[3]) : quoted_printable_decode(str_replace('_', ' ', $matches[3])); + if (strtolower($matches[1]) != 'utf-8') { + $data = self::convertToUTF8($data, $matches[1]); + } + return $data; + } + + /** + * Decodes all HTML entities (including numerical ones) to regular UTF-8 bytes. + * + * Double-escaped entities will only be decoded once ("&lt;" becomes "<" + * , not "<"). Be careful when using this function, as self::decodeEntities() + * can revert previous sanitization efforts (<script> will become + *