Index: includes/unicode.inc =================================================================== RCS file: /cvs/drupal/drupal/includes/unicode.inc,v retrieving revision 1.35 diff -u -p -r1.35 unicode.inc --- includes/unicode.inc 5 Nov 2008 12:58:59 -0000 1.35 +++ includes/unicode.inc 12 Nov 2008 22:57:53 -0000 @@ -326,69 +326,283 @@ function decode_entities($text, $exclude static $table; // We store named entities in a table for quick processing. if (!isset($table)) { - // Get all named HTML entities. - $table = array_flip(get_html_translation_table(HTML_ENTITIES)); - // PHP gives us ISO-8859-1 data, we need UTF-8. - $table = array_map('utf8_encode', $table); + $table = array ( + "Á" => "" . chr(195) . chr(129) . "", + "á" => "" . chr(195) . chr(161) . "", + "Â" => "" . chr(195) . chr(130) . "", + "â" => "" . chr(195) . chr(162) . "", + "´" => "" . chr(194) . chr(180) . "", + "Æ" => "" . chr(195) . chr(134) . "", + "æ" => "" . chr(195) . chr(166) . "", + "À" => "" . chr(195) . chr(128) . "", + "à" => "" . chr(195) . chr(160) . "", + "ℵ" => "" . chr(226) . chr(132) . chr(181) . "", + "Α" => "" . chr(206) . chr(145) . "", + "α" => "" . chr(206) . chr(177) . "", + "&" => "" . chr(38) . "", + "∧" => "" . chr(226) . chr(136) . chr(167) . "", + "∠" => "" . chr(226) . chr(136) . chr(160) . "", + "Å" => "" . chr(195) . chr(133) . "", + "å" => "" . chr(195) . chr(165) . "", + "≈" => "" . chr(226) . chr(137) . chr(136) . "", + "Ã" => "" . chr(195) . chr(131) . "", + "ã" => "" . chr(195) . chr(163) . "", + "Ä" => "" . chr(195) . chr(132) . "", + "ä" => "" . chr(195) . chr(164) . "", + "„" => "" . chr(226) . chr(128) . chr(158) . "", + "Β" => "" . chr(206) . chr(146) . "", + "β" => "" . chr(206) . chr(178) . "", + "¦" => "" . chr(194) . chr(166) . "", + "•" => "" . chr(226) . chr(128) . chr(162) . "", + "∩" => "" . chr(226) . chr(136) . chr(169) . "", + "Ç" => "" . chr(195) . chr(135) . "", + "ç" => "" . chr(195) . chr(167) . "", + "¸" => "" . chr(194) . chr(184) . "", + "¢" => "" . chr(194) . chr(162) . "", + "Χ" => "" . chr(206) . chr(167) . "", + "χ" => "" . chr(207) . chr(135) . "", + "ˆ" => "" . chr(203) . chr(134) . "", + "♣" => "" . chr(226) . chr(153) . chr(163) . "", + "≅" => "" . chr(226) . chr(137) . chr(133) . "", + "©" => "" . chr(194) . chr(169) . "", + "↵" => "" . chr(226) . chr(134) . chr(181) . "", + "∪" => "" . chr(226) . chr(136) . chr(170) . "", + "¤" => "" . chr(194) . chr(164) . "", + "†" => "" . chr(226) . chr(128) . chr(160) . "", + "‡" => "" . chr(226) . chr(128) . chr(161) . "", + "↓" => "" . chr(226) . chr(134) . chr(147) . "", + "⇓" => "" . chr(226) . chr(135) . chr(147) . "", + "°" => "" . chr(194) . chr(176) . "", + "Δ" => "" . chr(206) . chr(148) . "", + "δ" => "" . chr(206) . chr(180) . "", + "♦" => "" . chr(226) . chr(153) . chr(166) . "", + "÷" => "" . chr(195) . chr(183) . "", + "É" => "" . chr(195) . chr(137) . "", + "é" => "" . chr(195) . chr(169) . "", + "Ê" => "" . chr(195) . chr(138) . "", + "ê" => "" . chr(195) . chr(170) . "", + "È" => "" . chr(195) . chr(136) . "", + "è" => "" . chr(195) . chr(168) . "", + "∅" => "" . chr(226) . chr(136) . chr(133) . "", + " " => "" . chr(226) . chr(128) . chr(131) . "", + " " => "" . chr(226) . chr(128) . chr(130) . "", + "Ε" => "" . chr(206) . chr(149) . "", + "ε" => "" . chr(206) . chr(181) . "", + "≡" => "" . chr(226) . chr(137) . chr(161) . "", + "Η" => "" . chr(206) . chr(151) . "", + "η" => "" . chr(206) . chr(183) . "", + "Ð" => "" . chr(195) . chr(144) . "", + "ð" => "" . chr(195) . chr(176) . "", + "Ë" => "" . chr(195) . chr(139) . "", + "ë" => "" . chr(195) . chr(171) . "", + "€" => "" . chr(226) . chr(130) . chr(172) . "", + "∃" => "" . chr(226) . chr(136) . chr(131) . "", + "ƒ" => "" . chr(198) . chr(146) . "", + "∀" => "" . chr(226) . chr(136) . chr(128) . "", + "½" => "" . chr(194) . chr(189) . "", + "¼" => "" . chr(194) . chr(188) . "", + "¾" => "" . chr(194) . chr(190) . "", + "⁄" => "" . chr(226) . chr(129) . chr(132) . "", + "Γ" => "" . chr(206) . chr(147) . "", + "γ" => "" . chr(206) . chr(179) . "", + "≥" => "" . chr(226) . chr(137) . chr(165) . "", + "↔" => "" . chr(226) . chr(134) . chr(148) . "", + "⇔" => "" . chr(226) . chr(135) . chr(148) . "", + "♥" => "" . chr(226) . chr(153) . chr(165) . "", + "…" => "" . chr(226) . chr(128) . chr(166) . "", + "Í" => "" . chr(195) . chr(141) . "", + "í" => "" . chr(195) . chr(173) . "", + "Î" => "" . chr(195) . chr(142) . "", + "î" => "" . chr(195) . chr(174) . "", + "¡" => "" . chr(194) . chr(161) . "", + "Ì" => "" . chr(195) . chr(140) . "", + "ì" => "" . chr(195) . chr(172) . "", + "ℑ" => "" . chr(226) . chr(132) . chr(145) . "", + "∞" => "" . chr(226) . chr(136) . chr(158) . "", + "∫" => "" . chr(226) . chr(136) . chr(171) . "", + "Ι" => "" . chr(206) . chr(153) . "", + "ι" => "" . chr(206) . chr(185) . "", + "¿" => "" . chr(194) . chr(191) . "", + "∈" => "" . chr(226) . chr(136) . chr(136) . "", + "Ï" => "" . chr(195) . chr(143) . "", + "ï" => "" . chr(195) . chr(175) . "", + "Κ" => "" . chr(206) . chr(154) . "", + "κ" => "" . chr(206) . chr(186) . "", + "Λ" => "" . chr(206) . chr(155) . "", + "λ" => "" . chr(206) . chr(187) . "", + "⟨" => "" . chr(226) . chr(140) . chr(169) . "", + "«" => "" . chr(194) . chr(171) . "", + "←" => "" . chr(226) . chr(134) . chr(144) . "", + "⇐" => "" . chr(226) . chr(135) . chr(144) . "", + "⌈" => "" . chr(226) . chr(140) . chr(136) . "", + "“" => "" . chr(226) . chr(128) . chr(156) . "", + "≤" => "" . chr(226) . chr(137) . chr(164) . "", + "⌊" => "" . chr(226) . chr(140) . chr(138) . "", + "∗" => "" . chr(226) . chr(136) . chr(151) . "", + "◊" => "" . chr(226) . chr(151) . chr(138) . "", + "‎" => "" . chr(226) . chr(128) . chr(142) . "", + "‹" => "" . chr(226) . chr(128) . chr(185) . "", + "‘" => "" . chr(226) . chr(128) . chr(152) . "", + "¯" => "" . chr(194) . chr(175) . "", + "—" => "" . chr(226) . chr(128) . chr(148) . "", + "µ" => "" . chr(194) . chr(181) . "", + "·" => "" . chr(194) . chr(183) . "", + "−" => "" . chr(226) . chr(136) . chr(146) . "", + "Μ" => "" . chr(206) . chr(156) . "", + "μ" => "" . chr(206) . chr(188) . "", + "∇" => "" . chr(226) . chr(136) . chr(135) . "", + " " => "" . chr(194) . chr(160) . "", + "–" => "" . chr(226) . chr(128) . chr(147) . "", + "≠" => "" . chr(226) . chr(137) . chr(160) . "", + "∋" => "" . chr(226) . chr(136) . chr(139) . "", + "¬" => "" . chr(194) . chr(172) . "", + "∉" => "" . chr(226) . chr(136) . chr(137) . "", + "⊄" => "" . chr(226) . chr(138) . chr(132) . "", + "Ñ" => "" . chr(195) . chr(145) . "", + "ñ" => "" . chr(195) . chr(177) . "", + "Ν" => "" . chr(206) . chr(157) . "", + "ν" => "" . chr(206) . chr(189) . "", + "Ó" => "" . chr(195) . chr(147) . "", + "ó" => "" . chr(195) . chr(179) . "", + "Ô" => "" . chr(195) . chr(148) . "", + "ô" => "" . chr(195) . chr(180) . "", + "Œ" => "" . chr(197) . chr(146) . "", + "œ" => "" . chr(197) . chr(147) . "", + "Ò" => "" . chr(195) . chr(146) . "", + "ò" => "" . chr(195) . chr(178) . "", + "‾" => "" . chr(226) . chr(128) . chr(190) . "", + "Ω" => "" . chr(206) . chr(169) . "", + "ω" => "" . chr(207) . chr(137) . "", + "Ο" => "" . chr(206) . chr(159) . "", + "ο" => "" . chr(206) . chr(191) . "", + "⊕" => "" . chr(226) . chr(138) . chr(149) . "", + "∨" => "" . chr(226) . chr(136) . chr(168) . "", + "ª" => "" . chr(194) . chr(170) . "", + "º" => "" . chr(194) . chr(186) . "", + "Ø" => "" . chr(195) . chr(152) . "", + "ø" => "" . chr(195) . chr(184) . "", + "Õ" => "" . chr(195) . chr(149) . "", + "õ" => "" . chr(195) . chr(181) . "", + "⊗" => "" . chr(226) . chr(138) . chr(151) . "", + "Ö" => "" . chr(195) . chr(150) . "", + "ö" => "" . chr(195) . chr(182) . "", + "¶" => "" . chr(194) . chr(182) . "", + "∂" => "" . chr(226) . chr(136) . chr(130) . "", + "‰" => "" . chr(226) . chr(128) . chr(176) . "", + "⊥" => "" . chr(226) . chr(138) . chr(165) . "", + "Φ" => "" . chr(206) . chr(166) . "", + "φ" => "" . chr(207) . chr(134) . "", + "Π" => "" . chr(206) . chr(160) . "", + "π" => "" . chr(207) . chr(128) . "", + "ϖ" => "" . chr(207) . chr(150) . "", + "±" => "" . chr(194) . chr(177) . "", + "£" => "" . chr(194) . chr(163) . "", + "′" => "" . chr(226) . chr(128) . chr(178) . "", + "″" => "" . chr(226) . chr(128) . chr(179) . "", + "∏" => "" . chr(226) . chr(136) . chr(143) . "", + "∝" => "" . chr(226) . chr(136) . chr(157) . "", + "Ψ" => "" . chr(206) . chr(168) . "", + "ψ" => "" . chr(207) . chr(136) . "", + "√" => "" . chr(226) . chr(136) . chr(154) . "", + "⟩" => "" . chr(226) . chr(140) . chr(170) . "", + "»" => "" . chr(194) . chr(187) . "", + "→" => "" . chr(226) . chr(134) . chr(146) . "", + "⇒" => "" . chr(226) . chr(135) . chr(146) . "", + "⌉" => "" . chr(226) . chr(140) . chr(137) . "", + "”" => "" . chr(226) . chr(128) . chr(157) . "", + "ℜ" => "" . chr(226) . chr(132) . chr(156) . "", + "®" => "" . chr(194) . chr(174) . "", + "⌋" => "" . chr(226) . chr(140) . chr(139) . "", + "Ρ" => "" . chr(206) . chr(161) . "", + "ρ" => "" . chr(207) . chr(129) . "", + "‏" => "" . chr(226) . chr(128) . chr(143) . "", + "›" => "" . chr(226) . chr(128) . chr(186) . "", + "’" => "" . chr(226) . chr(128) . chr(153) . "", + "‚" => "" . chr(226) . chr(128) . chr(154) . "", + "Š" => "" . chr(197) . chr(160) . "", + "š" => "" . chr(197) . chr(161) . "", + "⋅" => "" . chr(226) . chr(139) . chr(133) . "", + "§" => "" . chr(194) . chr(167) . "", + "­" => "" . chr(194) . chr(173) . "", + "Σ" => "" . chr(206) . chr(163) . "", + "σ" => "" . chr(207) . chr(131) . "", + "ς" => "" . chr(207) . chr(130) . "", + "∼" => "" . chr(226) . chr(136) . chr(188) . "", + "♠" => "" . chr(226) . chr(153) . chr(160) . "", + "⊂" => "" . chr(226) . chr(138) . chr(130) . "", + "⊆" => "" . chr(226) . chr(138) . chr(134) . "", + "∑" => "" . chr(226) . chr(136) . chr(145) . "", + "¹" => "" . chr(194) . chr(185) . "", + "²" => "" . chr(194) . chr(178) . "", + "³" => "" . chr(194) . chr(179) . "", + "⊃" => "" . chr(226) . chr(138) . chr(131) . "", + "⊇" => "" . chr(226) . chr(138) . chr(135) . "", + "ß" => "" . chr(195) . chr(159) . "", + "Τ" => "" . chr(206) . chr(164) . "", + "τ" => "" . chr(207) . chr(132) . "", + "∴" => "" . chr(226) . chr(136) . chr(180) . "", + "Θ" => "" . chr(206) . chr(152) . "", + "θ" => "" . chr(206) . chr(184) . "", + "ϑ" => "" . chr(207) . chr(145) . "", + " " => "" . chr(226) . chr(128) . chr(137) . "", + "Þ" => "" . chr(195) . chr(158) . "", + "þ" => "" . chr(195) . chr(190) . "", + "˜" => "" . chr(203) . chr(156) . "", + "×" => "" . chr(195) . chr(151) . "", + "™" => "" . chr(226) . chr(132) . chr(162) . "", + "Ú" => "" . chr(195) . chr(154) . "", + "ú" => "" . chr(195) . chr(186) . "", + "↑" => "" . chr(226) . chr(134) . chr(145) . "", + "⇑" => "" . chr(226) . chr(135) . chr(145) . "", + "Û" => "" . chr(195) . chr(155) . "", + "û" => "" . chr(195) . chr(187) . "", + "Ù" => "" . chr(195) . chr(153) . "", + "ù" => "" . chr(195) . chr(185) . "", + "¨" => "" . chr(194) . chr(168) . "", + "ϒ" => "" . chr(207) . chr(146) . "", + "Υ" => "" . chr(206) . chr(165) . "", + "υ" => "" . chr(207) . chr(133) . "", + "Ü" => "" . chr(195) . chr(156) . "", + "ü" => "" . chr(195) . chr(188) . "", + "℘" => "" . chr(226) . chr(132) . chr(152) . "", + "Ξ" => "" . chr(206) . chr(158) . "", + "ξ" => "" . chr(206) . chr(190) . "", + "Ý" => "" . chr(195) . chr(157) . "", + "ý" => "" . chr(195) . chr(189) . "", + "¥" => "" . chr(194) . chr(165) . "", + "ÿ" => "" . chr(195) . chr(191) . "", + "Ÿ" => "" . chr(197) . chr(184) . "", + "Ζ" => "" . chr(206) . chr(150) . "", + "ζ" => "" . chr(206) . chr(182) . "", + "‍" => "" . chr(226) . chr(128) . chr(141) . "", + "‌" => "" . chr(226) . chr(128) . chr(140) . "", + ">" => ">", + "<" => "<", // Add apostrophe (XML) - $table['''] = "'"; + ''' => "'", + ); } $newtable = array_diff($table, $exclude); - - // Use a regexp to select all entities in one pass, to avoid decoding - // double-escaped entities twice. The PREG_REPLACE_EVAL modifier 'e' is - // being used to allow for a callback (see - // http://php.net/manual/en/reference.pcre.pattern.modifiers). - return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $newtable, $exclude)', $text); + $return_text = strtr($text, $newtable); + $return_text = preg_replace('~&#x([0-9a-f]+);~ei', '_code_to_utf8(hexdec("\\1"))', $return_text); + $return_text = preg_replace('~&#([0-9]+);~e', '_code_to_utf8(\\1)', $return_text); + return $return_text; } /** * Helper function for decode_entities */ -function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) { - // Named entity - if (!$prefix) { - if (isset($table[$original])) { - return $table[$original]; - } - else { - return $original; - } - } - // Hexadecimal numerical entity - if ($prefix == '#x') { - $codepoint = base_convert($codepoint, 16, 10); - } - // Decimal numerical entity (strip leading zeros to avoid PHP octal notation) - else { - $codepoint = preg_replace('/^0+/', '', $codepoint); - } - // Encode codepoint as UTF-8 bytes - if ($codepoint < 0x80) { - $str = chr($codepoint); - } - elseif ($codepoint < 0x800) { - $str = chr(0xC0 | ($codepoint >> 6)) - . chr(0x80 | ($codepoint & 0x3F)); - } - elseif ($codepoint < 0x10000) { - $str = chr(0xE0 | ( $codepoint >> 12)) - . chr(0x80 | (($codepoint >> 6) & 0x3F)) - . chr(0x80 | ( $codepoint & 0x3F)); - } - elseif ($codepoint < 0x200000) { - $str = chr(0xF0 | ( $codepoint >> 18)) - . chr(0x80 | (($codepoint >> 12) & 0x3F)) - . chr(0x80 | (($codepoint >> 6) & 0x3F)) - . chr(0x80 | ( $codepoint & 0x3F)); - } - // Check for excluded characters - if (in_array($str, $exclude)) { - return $original; - } - else { - return $str; +function _code_to_utf8($num) { + if ($num <= 0x7F) { + return chr($num); + } elseif ($num <= 0x7FF) { + return chr(($num >> 0x06) + 0xC0) . chr(($num & 0x3F) + 128); + } elseif ($num <= 0xFFFF) { + return chr(($num >> 0x0C) + 0xE0) . chr((($num >> 0x06) & 0x3F) + 0x80) . chr(($num & 0x3F) + 0x80); + } elseif ($num <= 0x1FFFFF) { + return chr(($num >> 0x12) + 0xF0) . chr((($num >> 0x0C) & 0x3F) + 0x80) . chr((($num >> 0x06) & 0x3F) + 0x80) . chr(($num & 0x3F) + 0x80); } + return ' '; // default value } /**