If HtmlAgilityPackSanitizerProvider is given markup that has previously been sanitized, entities in attribute values are encoded again. The effect is that each character of an existing entity (e.g. `&#x3A;`, the encoded form of `:`) is itself encoded individually. Markup that is passed through the sanitizer repeatedly therefore balloons, and a few hundred characters of user-submitted text grow to megabytes very quickly.
I have amended my CleanAttributeValues method:
/// <summary>
/// Sanitizes the value of a single HTML attribute in place: strips script-related
/// keywords, then entity-encodes every character that is not already part of an
/// HTML entity, so that running the sanitizer again does not re-encode its own output.
/// </summary>
/// <param name="attribute">
/// The attribute whose <c>Value</c> is rewritten. An attribute with a null value is left untouched.
/// </param>
private void CleanAttributeValues(HtmlAttribute attribute)
{
    // HttpUtility.HtmlEncode is deliberately not used here: it does not understand
    // hex entities and so doubly encodes them.
    string value = attribute.Value;
    if (value == null)
    {
        // Regex.Replace throws on null input; there is nothing to clean.
        return;
    }

    // Culture-invariant lowering, computed once instead of once per comparison.
    string name = attribute.Name.ToLowerInvariant();
    bool isStyle = name == "style";
    bool isLink = name == "href" || name == "src";

    // Strip the keywords repeatedly until the value stops changing. A single pass
    // can be defeated by nesting, e.g. "javascrscriptipt:" collapses to "javascript:"
    // once the inner "script" is removed. Every replacement shortens the string,
    // so the loop always terminates.
    string previous;
    do
    {
        previous = value;

        value = Regex.Replace(value, @"\s*j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*", "", RegexOptions.IgnoreCase);
        value = Regex.Replace(value, @"\s*s\s*c\s*r\s*i\s*p\s*t\s*", "", RegexOptions.IgnoreCase);

        if (isStyle)
        {
            value = Regex.Replace(value, @"\s*e\s*x\s*p\s*r\s*e\s*s\s*s\s*i\s*o\s*n\s*", "", RegexOptions.IgnoreCase);
            value = Regex.Replace(value, @"\s*b\s*e\s*h\s*a\s*v\s*i\s*o\s*r\s*", "", RegexOptions.IgnoreCase);
        }

        if (isLink)
        {
            value = Regex.Replace(value, @"\s*m\s*o\s*c\s*h\s*a\s*", "", RegexOptions.IgnoreCase);
        }
    }
    while (value != previous);

    // Don't doubly escape characters (which leads to ever expanding markup):
    // tokenize the value into existing entities and single characters.
    //  - The hex class no longer contains a stray ',' (it used to be [0-9,A-F]).
    //  - Digit counts start at 1 and allow long code points, so entities such as
    //    "&#9;" or "&#x1F600;" are recognised instead of being encoded again.
    const string pattern = @"(
        \&\#x[0-9A-F]{1,6};  | # Match a hex entity
        \&\#[0-9]{1,7};      | # Or a decimal entity
        \&\w+;               | # Or an alphabetic named entity
        .                      # Else match any single character
    )";

    // Singleline makes '.' match '\n' as well; without it newlines matched no
    // alternative and were silently dropped from the rebuilt value.
    // The static Regex.Matches overload goes through the framework's regex cache,
    // so the pattern is not rebuilt (or recompiled) on every call.
    const RegexOptions options = RegexOptions.IgnoreCase |
                                 RegexOptions.IgnorePatternWhitespace |
                                 RegexOptions.Singleline;

    StringBuilder encoded = new StringBuilder(value.Length);
    foreach (Match match in Regex.Matches(value, pattern, options))
    {
        if (match.Length == 1)
        {
            // A lone character: hand it to the entity encoder.
            // NOTE(review): presumably EncodeCharacterToHtmlEntityEscape emits a hex
            // entity for unsafe characters - confirm against its definition.
            encoded.Append(EncodeCharacterToHtmlEntityEscape(match.Value[0]));
        }
        else
        {
            // An existing entity: copy it through unchanged.
            encoded.Append(match.Value);
        }
    }

    attribute.Value = encoded.ToString();
}
I have amended my CleanAttributeValues method:
/// <summary>
/// Sanitizes the value of a single HTML attribute in place: strips script-related
/// keywords, then entity-encodes every character that is not already part of an
/// HTML entity, so that running the sanitizer again does not re-encode its own output.
/// </summary>
/// <param name="attribute">
/// The attribute whose <c>Value</c> is rewritten. An attribute with a null value is left untouched.
/// </param>
private void CleanAttributeValues(HtmlAttribute attribute)
{
    // HttpUtility.HtmlEncode is deliberately not used here: it does not understand
    // hex entities and so doubly encodes them.
    string value = attribute.Value;
    if (value == null)
    {
        // Regex.Replace throws on null input; there is nothing to clean.
        return;
    }

    // Culture-invariant lowering, computed once instead of once per comparison.
    string name = attribute.Name.ToLowerInvariant();
    bool isStyle = name == "style";
    bool isLink = name == "href" || name == "src";

    // Strip the keywords repeatedly until the value stops changing. A single pass
    // can be defeated by nesting, e.g. "javascrscriptipt:" collapses to "javascript:"
    // once the inner "script" is removed. Every replacement shortens the string,
    // so the loop always terminates.
    string previous;
    do
    {
        previous = value;

        value = Regex.Replace(value, @"\s*j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*", "", RegexOptions.IgnoreCase);
        value = Regex.Replace(value, @"\s*s\s*c\s*r\s*i\s*p\s*t\s*", "", RegexOptions.IgnoreCase);

        if (isStyle)
        {
            value = Regex.Replace(value, @"\s*e\s*x\s*p\s*r\s*e\s*s\s*s\s*i\s*o\s*n\s*", "", RegexOptions.IgnoreCase);
            value = Regex.Replace(value, @"\s*b\s*e\s*h\s*a\s*v\s*i\s*o\s*r\s*", "", RegexOptions.IgnoreCase);
        }

        if (isLink)
        {
            value = Regex.Replace(value, @"\s*m\s*o\s*c\s*h\s*a\s*", "", RegexOptions.IgnoreCase);
        }
    }
    while (value != previous);

    // Don't doubly escape characters (which leads to ever expanding markup):
    // tokenize the value into existing entities and single characters.
    //  - The hex class no longer contains a stray ',' (it used to be [0-9,A-F]).
    //  - Digit counts start at 1 and allow long code points, so entities such as
    //    "&#9;" or "&#x1F600;" are recognised instead of being encoded again.
    const string pattern = @"(
        \&\#x[0-9A-F]{1,6};  | # Match a hex entity
        \&\#[0-9]{1,7};      | # Or a decimal entity
        \&\w+;               | # Or an alphabetic named entity
        .                      # Else match any single character
    )";

    // Singleline makes '.' match '\n' as well; without it newlines matched no
    // alternative and were silently dropped from the rebuilt value.
    // The static Regex.Matches overload goes through the framework's regex cache,
    // so the pattern is not rebuilt (or recompiled) on every call.
    const RegexOptions options = RegexOptions.IgnoreCase |
                                 RegexOptions.IgnorePatternWhitespace |
                                 RegexOptions.Singleline;

    StringBuilder encoded = new StringBuilder(value.Length);
    foreach (Match match in Regex.Matches(value, pattern, options))
    {
        if (match.Length == 1)
        {
            // A lone character: hand it to the entity encoder.
            // NOTE(review): presumably EncodeCharacterToHtmlEntityEscape emits a hex
            // entity for unsafe characters - confirm against its definition.
            encoded.Append(EncodeCharacterToHtmlEntityEscape(match.Value[0]));
        }
        else
        {
            // An existing entity: copy it through unchanged.
            encoded.Append(match.Value);
        }
    }

    attribute.Value = encoded.ToString();
}