Index: drupal.js
===================================================================
RCS file: /cvs/drupal/drupal/misc/drupal.js,v
retrieving revision 1.45
diff -u -p -r1.45 drupal.js
--- drupal.js	25 Jun 2008 07:45:03 -0000	1.45
+++ drupal.js	6 Sep 2008 18:47:30 -0000
@@ -8,6 +8,11 @@ var Drupal = Drupal || { 'settings': {},
 Drupal.jsEnabled = document.getElementsByTagName && document.createElement && document.createTextNode && document.documentElement && document.getElementById;
 
 /**
+ * Set the array of allowed HTML tags.
+ */
+Drupal.allowedTags = 'a em strong cite code ul ol li dl dt dd'.split(' ');
+
+/**
  * Attach all registered behaviors to a page element.
  *
  * Behaviors are event-triggered actions that attach to page elements, enhancing
@@ -43,6 +48,358 @@ Drupal.attachBehaviors = function(contex
 };
 
 /**
+ * Checks whether a string is valid UTF-8.
+ */
+Drupal.validateUTF8 = function (text) {
+  // JavaScript strings are sequences of UTF-16 code units, so the only way
+  // text can fail to be encodable as UTF-8 is an unpaired surrogate. Remove
+  // every well-formed high/low pair first; any surrogate left is unpaired.
+  // Always returns a boolean; the empty string is valid.
+  var rest = String(text).replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g, '');
+  return !/[\uD800-\uDFFF]/.test(rest);
+};
+
+/**
+ * Filters XSS.
+ *
+ * See the documentation of the server-side filter_xss() function for further details.
+ * The optional allowedTags array applies to this call only; when omitted,
+ * Drupal.allowedTags is used. Returns the filtered string.
+ */
+Drupal.filterXSS = function (str, allowedTags) {
+  var defaultTags = Drupal.allowedTags;
+  str = String(str)
+    .replace(/\0/g, '') // Remove NUL characters (ignored by some browsers).
+    .replace(/&\s*\{[^}]*(\}\s*;?|$)/g, '') // Remove Netscape 4 JS entities.
+    .replace(/&/g, '&amp;') // Defuse all HTML entities.
+    .replace(/&amp;([A-Za-z][A-Za-z0-9]*;)/g, '&$1') // Change back only well-formed named entities.
+    .replace(/&amp;#([0-9]+;)/g, '&#$1') // Decimal numeric entities.
+    .replace(/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/g, '&#x$1'); // Hexadecimal numeric entities.
+  // Drupal.filterXSSSplit() reads Drupal.allowedTags, so a custom list is
+  // swapped in for this call only and the default is restored afterwards.
+  Drupal.allowedTags = typeof(allowedTags) != 'undefined' ? allowedTags : defaultTags;
+  try {
+    return str.replace(/(<(?=[^a-zA-Z!\/])|<[^>]*(>|$)|>)/g, Drupal.filterXSSSplit);
+  }
+  finally {
+    Drupal.allowedTags = defaultTags;
+  }
+};
+
+/**
+ * Processes an HTML tag.
+ *
+ * Replacement callback of Drupal.filterXSS(); only the matched text m is
+ * used, key and value are ignored.
+ *
+ * Returns "&lt;" or "&gt;" for a lone angle bracket, '' for a malformed or
+ * disallowed tag, otherwise the tag rebuilt with its filtered attributes.
+ */
+Drupal.filterXSSSplit = function (m, key, value) {
+  var str = m;
+  if (str.substring(0, 1) != '<') {
+    // We matched a lone ">" character
+    return '&gt;';
+  }
+  else if (str.length == 1) {
+    // We matched a lone "<" character
+    return '&lt;';
+  }
+
+  // match() returns null, not an empty array, when nothing matches.
+  var matches = str.match(/^<\s*(\/\s*)?([a-zA-Z0-9]+)([^>]*)>?$/);
+  if (!matches) {
+    // Seriously malformed
+    return '';
+  }
+
+  var slash = jQuery.trim(matches[1] || '');
+  var elem = matches[2];
+  var attrlist = matches[3];
+  // Drupal.allowedTags is an array, so it has to be searched by value.
+  if (jQuery.inArray(elem.toLowerCase(), Drupal.allowedTags) == -1) {
+    // Disallowed HTML element
+    return '';
+  }
+  if (slash != '') {
+    return "</" + elem + ">";
+  }
+
+  // Is there a closing XHTML slash at the end of the attributes?
+  var xhtmlSlash = attrlist.match(/\s?\/\s*$/) ? ' /' : '';
+  attrlist = attrlist.replace(/(\s?)\/\s*$/, '$1');
+
+  // Clean up attributes
+  var attr2 = Drupal.filterXSSAttributes(attrlist).join(' ');
+  attr2 = attr2.replace(/[<>]/g, '');
+  attr2 = attr2.length ? ' ' + attr2 : '';
+  return "<" + elem + attr2 + xhtmlSlash + ">";
+};
+
+/**
+ * Processes a string of HTML attributes.
+ *
+ * The string is consumed from the left by a small state machine:
+ *   - mode 0 expects an attribute name,
+ *   - mode 1 expects an equals sign, or whitespace ending a valueless
+ *     attribute,
+ *   - mode 2 expects a double-quoted, single-quoted or unquoted value.
+ * Text that fits none of these is discarded up to the next unquoted
+ * whitespace and parsing resumes in mode 0.
+ *
+ * "style" attributes and attributes whose name starts with "on" are dropped.
+ * Every kept value is passed through Drupal.filterXSSBadProtocol() and
+ * re-quoted with double quotes.
+ *
+ * @param attr
+ *   The attribute part of a tag, as captured by Drupal.filterXSSSplit().
+ * @return
+ *   An array of cleaned attributes, each either 'name' or 'name="value"'.
+ */
+Drupal.filterXSSAttributes = function (attr) {
+  var attrarr = [];
+  var mode = 0;
+  var attrname = '';
+  // True while the attribute being parsed must be left out of the result.
+  var skip = false;
+  var matches, working;
+  // Tried in this order in mode 2; group 1 is the raw attribute value.
+  var valuePatterns = [
+    /^"([^"]*)"(\s+|$)/, // Double-quoted value.
+    /^'([^']*)'(\s+|$)/, // Single-quoted value.
+    /^([^\s"']+)(\s+|$)/ // Unquoted value.
+  ];
+
+  while (attr.length != 0) {
+    // Was the last operation successful?
+    working = 0;
+
+    switch (mode) {
+      case 0:
+        // Attribute name, href for instance
+        matches = attr.match(/^([-a-zA-Z]+)/);
+        if (matches) {
+          attrname = matches[1].toLowerCase();
+          skip = (attrname == 'style' || attrname.substring(0, 2) == 'on');
+          working = mode = 1;
+          attr = attr.substring(matches[0].length);
+        }
+        break;
+
+      case 1:
+        // Equals sign or valueless ("selected")
+        matches = attr.match(/^\s*=\s*/);
+        if (matches) {
+          working = 1;
+          mode = 2;
+          attr = attr.substring(matches[0].length);
+          break;
+        }
+        matches = attr.match(/^\s+/);
+        if (matches) {
+          working = 1;
+          mode = 0;
+          if (!skip) {
+            attrarr.push(attrname);
+          }
+          attr = attr.substring(matches[0].length);
+        }
+        break;
+
+      case 2:
+        // Attribute value, a URL after href= for instance
+        for (var i = 0; i < valuePatterns.length && !working; i++) {
+          matches = attr.match(valuePatterns[i]);
+          if (matches) {
+            if (!skip) {
+              attrarr.push(attrname + '="' + Drupal.filterXSSBadProtocol(matches[1]) + '"');
+            }
+            working = 1;
+            mode = 0;
+            attr = attr.substring(matches[0].length);
+          }
+        }
+        break;
+    }
+
+    if (working == 0) {
+      // not well formed, remove and try again
+      attr = attr.replace(/^("[^"]*("|$)|'[^']*('|$)|\S)*\s*/, '');
+      mode = 0;
+    }
+  }
+
+  // the attribute list ends with a valueless attribute like "selected"
+  if (mode == 1 && !skip) {
+    attrarr.push(attrname);
+  }
+  return attrarr;
+};
+
+/**
+ * Processes an HTML attribute value and ensures it does not contain a URL
+ * with a disallowed protocol (e.g. javascript:). The value is entity-decoded
+ * first, and the result is escaped again with Drupal.checkPlain() before it
+ * is returned.
+ */
+Drupal.filterXSSBadProtocol = function (str) {
+  var allowedProtocols = ['http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'mailto', 'irc', 'ssh', 'sftp', 'webcal'];
+  var before, colonpos, protocol;
+  // Get the plain text representation of the attribute value (i.e. its meaning).
+  str = Drupal.htmlEntityDecode(str);
+  // Iteratively remove any invalid protocol found.
+  do {
+    before = str;
+    colonpos = str.indexOf(':');
+    if (colonpos > 0) {
+      // We found a colon, possibly a protocol. Verify.
+      protocol = str.substring(0, colonpos);
+      // If a colon is preceded by a slash, question mark or hash, it cannot
+      // possibly be part of the URL scheme. This must be a relative URL,
+      // which inherits the (safe) protocol of the base document.
+      if (/[\/?#]/.test(protocol)) {
+        break;
+      }
+      // Per RFC2616, section 3.2.3 (URI Comparison) scheme comparison must be case-insensitive
+      // Check if this is a disallowed protocol.
+      if (jQuery.inArray(protocol.toLowerCase(), allowedProtocols) == -1) {
+        str = str.substring(colonpos + 1);
+      }
+    }
+  } while (before != str);
+
+  return Drupal.checkPlain(str);
+};
+
+/**
+ * Decode all HTML entities (including numerical ones) to the characters they
+ * stand for. Double-escaped entities will only be decoded once ("&amp;lt;"
+ * becomes "&lt;", not "<"); unknown entity names are left untouched.
+ */
+Drupal.htmlEntityDecode = function (str) {
+  var histogram = {}, histogramR = {}, code;
+  histogram['34'] = 'quot';
+  histogram['38'] = 'amp';
+  histogram['60'] = 'lt';
+  histogram['62'] = 'gt';
+  histogram['160'] = 'nbsp';
+  histogram['161'] = 'iexcl';
+  histogram['162'] = 'cent';
+  histogram['163'] = 'pound';
+  histogram['164'] = 'curren';
+  histogram['165'] = 'yen';
+  histogram['166'] = 'brvbar';
+  histogram['167'] = 'sect';
+  histogram['168'] = 'uml';
+  histogram['169'] = 'copy';
+  histogram['170'] = 'ordf';
+  histogram['171'] = 'laquo';
+  histogram['172'] = 'not';
+  histogram['173'] = 'shy';
+  histogram['174'] = 'reg';
+  histogram['175'] = 'macr';
+  histogram['176'] = 'deg';
+  histogram['177'] = 'plusmn';
+  histogram['178'] = 'sup2';
+  histogram['179'] = 'sup3';
+  histogram['180'] = 'acute';
+  histogram['181'] = 'micro';
+  histogram['182'] = 'para';
+  histogram['183'] = 'middot';
+  histogram['184'] = 'cedil';
+  histogram['185'] = 'sup1';
+  histogram['186'] = 'ordm';
+  histogram['187'] = 'raquo';
+  histogram['188'] = 'frac14';
+  histogram['189'] = 'frac12';
+  histogram['190'] = 'frac34';
+  histogram['191'] = 'iquest';
+  histogram['192'] = 'Agrave';
+  histogram['193'] = 'Aacute';
+  histogram['194'] = 'Acirc';
+  histogram['195'] = 'Atilde';
+  histogram['196'] = 'Auml';
+  histogram['197'] = 'Aring';
+  histogram['198'] = 'AElig';
+  histogram['199'] = 'Ccedil';
+  histogram['200'] = 'Egrave';
+  histogram['201'] = 'Eacute';
+  histogram['202'] = 'Ecirc';
+  histogram['203'] = 'Euml';
+  histogram['204'] = 'Igrave';
+  histogram['205'] = 'Iacute';
+  histogram['206'] = 'Icirc';
+  histogram['207'] = 'Iuml';
+  histogram['208'] = 'ETH';
+  histogram['209'] = 'Ntilde';
+  histogram['210'] = 'Ograve';
+  histogram['211'] = 'Oacute';
+  histogram['212'] = 'Ocirc';
+  histogram['213'] = 'Otilde';
+  histogram['214'] = 'Ouml';
+  histogram['215'] = 'times';
+  histogram['216'] = 'Oslash';
+  histogram['217'] = 'Ugrave';
+  histogram['218'] = 'Uacute';
+  histogram['219'] = 'Ucirc';
+  histogram['220'] = 'Uuml';
+  histogram['221'] = 'Yacute';
+  histogram['222'] = 'THORN';
+  histogram['223'] = 'szlig';
+  histogram['224'] = 'agrave';
+  histogram['225'] = 'aacute';
+  histogram['226'] = 'acirc';
+  histogram['227'] = 'atilde';
+  histogram['228'] = 'auml';
+  histogram['229'] = 'aring';
+  histogram['230'] = 'aelig';
+  histogram['231'] = 'ccedil';
+  histogram['232'] = 'egrave';
+  histogram['233'] = 'eacute';
+  histogram['234'] = 'ecirc';
+  histogram['235'] = 'euml';
+  histogram['236'] = 'igrave';
+  histogram['237'] = 'iacute';
+  histogram['238'] = 'icirc';
+  histogram['239'] = 'iuml';
+  histogram['240'] = 'eth';
+  histogram['241'] = 'ntilde';
+  histogram['242'] = 'ograve';
+  histogram['243'] = 'oacute';
+  histogram['244'] = 'ocirc';
+  histogram['245'] = 'otilde';
+  histogram['246'] = 'ouml';
+  histogram['247'] = 'divide';
+  histogram['248'] = 'oslash';
+  histogram['249'] = 'ugrave';
+  histogram['250'] = 'uacute';
+  histogram['251'] = 'ucirc';
+  histogram['252'] = 'uuml';
+  histogram['253'] = 'yacute';
+  histogram['254'] = 'thorn';
+  histogram['255'] = 'yuml';
+  // Reverse table (name to code); the histogram above stays keyed by code for maintainability.
+  for (code in histogram) {
+    histogramR[histogram[code]] = parseInt(code, 10);
+  }
+  return String(str).replace(/&(?:#[xX]([0-9A-Fa-f]+)|#([0-9]+)|([a-zA-Z][a-zA-Z0-9]*));/g, function (full, hex, dec, name) {
+    var point = hex ? parseInt(hex, 16) : (dec ? parseInt(dec, 10) : -1);
+    if (name && histogramR.hasOwnProperty(name)) {
+      point = histogramR[name];
+    }
+    if (point < 0 || point > 0x10FFFF) {
+      // Unknown name or impossible code point: leave the text untouched.
+      return full;
+    }
+    if (point > 0xFFFF) {
+      // Above U+FFFF a UTF-16 surrogate pair is needed (0xD7C0 = 0xD800 - (0x10000 >> 10)).
+      return String.fromCharCode(0xD7C0 + (point >> 10), 0xDC00 + (point & 0x3FF));
+    }
+    return String.fromCharCode(point);
+  });
+};
+
+/**
  * Encode special characters in a plain-text string for display as HTML.
  */
 Drupal.checkPlain = function(str) {
