以下代码的目的是能够接受来自可能包含HTML的cliënts的字符串,并删除样式,脚本,某些标记并用B标记替换H标记。
private IDictionary<string, string[]> Whitelist;
public vacatures PostPutVacancy(vacancy vacancy)
{
//List of allowed tags
Whitelist = new Dictionary<string, string[]> {
{ "p", null },
{ "ul", null },
{ "li", null },
{ "br", null },
{ "b", null },
{ "table", null },
{ "tr", null },
{ "th", null },
{ "td", null },
{ "strong", null }
};
foreach (var item in vacancy.GetType().GetProperties())
{
if (vacancy.GetType().GetProperty(item.Name).PropertyType.FullName.Contains("String"))
{
var value = item.GetValue(vacancy, null);
if (value != null)
{
item.SetValue(vacancy, CallSanitizers(item.GetValue(vacancy, null)));
var test1 = item.GetValue(vacancy);
}
}
}
return vacancy;
}
private List<string> hList = new List<string>
{
{ "h1"},
{ "h2"},
{ "h3"},
{ "h4"},
{ "h5"},
{ "h6"}
};
private string CallSanitizers(object obj)//==Sanitize()
{
string str = obj.ToString();
if (str != HttpUtility.HtmlEncode(str))
{
doc.LoadHtml(str);
SanitizeNode(doc.DocumentNode);
string test = doc.DocumentNode.WriteTo().Trim();
return doc.DocumentNode.WriteTo().Trim();
}
else
{
return str;
}
}
private void SanitizeChildren(HtmlNode parentNode)
{
for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
{
SanitizeNode(parentNode.ChildNodes[i]);
}
}
private void SanitizeNode(HtmlNode node)
{
if (node.NodeType == HtmlNodeType.Element)
{
if (!Whitelist.ContainsKey(node.Name))
{
if (hList.Contains(node.Name))
{
HtmlNode b = doc.CreateElement("b");
b.InnerHtml = node.InnerHtml;
node.ParentNode.ReplaceChild(b, node);
}
else
{
node.ParentNode.RemoveChild(node, true);
}
}
if (node.HasAttributes)
{
for (int i = node.Attributes.Count - 1; i >= 0; i--)
{
HtmlAttribute currentAttribute = node.Attributes[i];
node.Attributes.Remove(currentAttribute);
}
}
}
if (node.HasChildNodes)
{
SanitizeChildren(node);
}
}
它有效但有一个问题,子节点的子节点没有被消毒,参见示例。
输入:
"Lorem ipsum<h1 style='font-size:38px;'><p style='font-size:38px;'>dolor sit</p></h1> amet <h1 style='font-size:38px;'><strong style='font-size:38px;'>consectetur adipiscing</strong></h1>"
结果:
"Lorem ipsum<b><p style='font-size:38px;'>dolor sit</p></b> amet <b style='font-size:38px;'><strong style='font-size:38px;'>consectetur adipiscing</strong></b>"
问题必然是由于无法将孩子放回更改后的父母,因为由于标签类型的更改而不再识别父母。
有人知道如何解决这个问题吗?
如果问题不清楚或制定不当,请发表评论。
提前致谢
答案 0 :(得分:0)
这解决了它
private string CallSanitizers(string str)
{
if (str != HttpUtility.HtmlEncode(str))
{
doc.LoadHtml(str);
str = Sanitizers();
return doc.DocumentNode.WriteTo().Trim();
}
else
{
return str;
}
}
private string Sanitizers()
{
doc.DocumentNode.Descendants().Where(l => l.Name == "script" || l.Name == "style").ToList().ForEach(l => l.Remove());
doc.DocumentNode.Descendants().Where(l => hList.Contains(l.Name)).ToList().ForEach(l => l.Name = "b");
doc.DocumentNode.Descendants().Where(l => l.Attributes != null).ToList().ForEach(l => l.Attributes.ToList().ForEach(a => a.Remove()));
doc.DocumentNode.Descendants().Where(l => !Whitelist.Contains(l.Name) && l.NodeType == HtmlNodeType.Element).ToList().ForEach(l => l.ParentNode.RemoveChild(l, true));
return doc.DocumentNode.OuterHtml;
}
//lijst van tags die worden vervangen door <b></b>
private List<string> hList = new List<string>
{
{ "h1"},
{ "h2"},
{ "h3"},
{ "h4"},
{ "h5"},
{ "h6"}
};
List<string> Whitelist = new List<string>
{
{ "p"},
{ "ul"},
{ "li"},
{ "br"},
{ "b"},
{ "table"},
{ "tr"},
{ "th"},
{ "td"},
{ "strong"}
};
输入
"<head><script>alert('Hello!');</script></head><div><div><h1>Lorem ipsum </h1></div></div> <h1 style='font-size:38px;'><p style='font-size:38px;'>dolor </p></h1> sit <h1 style='font-size:38px;'><strong style='font-size:38px;'>amet</strong></h1>"
输出
"<b>Lorem ipsum</b> <b><p>dolor</p></b> sit <b><strong>amet</strong></b>"