使用 HTML Agility Pack 清理(sanitize)未知数量的后代节点时不起作用

时间:2017-01-03 09:50:05

标签: c# html-agility-pack

以下代码的目的是接收来自客户端、可能包含 HTML 的字符串,删除其中的样式、脚本和某些标签,并将 H 标签(h1–h6)替换为 B 标签。

  // Tag names that survive sanitizing. Assigned at the start of PostPutVacancy and
  // queried with ContainsKey in SanitizeNode. Every value is null in the visible code —
  // presumably reserved for per-tag allowed attributes; TODO confirm.
  private IDictionary<string, string[]> Whitelist;
    /// <summary>
    /// Runs every non-null string property of <paramref name="vacancy"/> through
    /// CallSanitizers and writes the sanitized text back to the same property.
    /// </summary>
    /// <param name="vacancy">Object whose public string properties are sanitized in place.</param>
    /// <returns>The same <paramref name="vacancy"/> instance after sanitizing.</returns>
    public vacatures PostPutVacancy(vacancy vacancy)
    {
        //List of allowed tags
        Whitelist = new Dictionary<string, string[]> {
            { "p", null },
            { "ul", null },
            { "li", null },
            { "br", null },
            { "b", null },
            { "table", null },
            { "tr", null },
            { "th", null },
            { "td", null },
            { "strong", null }
        };

        foreach (var item in vacancy.GetType().GetProperties())
        {
            // Exact type comparison: a FullName.Contains("String") test also matches types such as
            // List<string> or StringBuilder, and assigning a string to those properties throws.
            if (item.PropertyType != typeof(string))
            {
                continue;
            }

            // Properties that cannot be read or written, and indexers, cannot be round-tripped.
            if (!item.CanRead || !item.CanWrite || item.GetIndexParameters().Length != 0)
            {
                continue;
            }

            var value = item.GetValue(vacancy, null);
            if (value != null)
            {
                item.SetValue(vacancy, CallSanitizers(value), null);
            }
        }

        return vacancy;
    }

    // Heading tag names; SanitizeNode replaces elements with these names by a b element.
    private List<string> hList = new List<string> { "h1", "h2", "h3", "h4", "h5", "h6" };

    /// <summary>
    /// Converts <paramref name="obj"/> to text and, when that text contains characters that
    /// HTML-encoding would change, loads it into the shared document, sanitizes the node tree
    /// and returns the trimmed result.
    /// </summary>
    /// <param name="obj">Value to sanitize; its ToString() result is used.</param>
    /// <returns>The sanitized markup, the unchanged text, or null when <paramref name="obj"/> is null.</returns>
    private string CallSanitizers(object obj)//==Sanitize()
    {
        if (obj == null)
        {
            return null;
        }

        string str = obj.ToString();

        // If HTML-encoding leaves the text unchanged it holds no characters such as '<' or '&',
        // so there is nothing to parse or strip.
        if (str == HttpUtility.HtmlEncode(str))
        {
            return str;
        }

        doc.LoadHtml(str);
        SanitizeNode(doc.DocumentNode);

        // Serialize once; the earlier unused local serialized the whole document a second time.
        return doc.DocumentNode.WriteTo().Trim();
    }

    /// <summary>
    /// Sanitizes each direct child of <paramref name="parentNode"/>, walking from the last
    /// child to the first.
    /// </summary>
    /// <param name="parentNode">Node whose children are passed to SanitizeNode.</param>
    private void SanitizeChildren(HtmlNode parentNode)
    {
        // The starting index is captured once; SanitizeNode may replace or remove the child it is given.
        int index = parentNode.ChildNodes.Count;
        while (index > 0)
        {
            index--;
            HtmlNode child = parentNode.ChildNodes[index];
            SanitizeNode(child);
        }
    }

    /// <summary>
    /// Sanitizes <paramref name="node"/> and, recursively, its children: heading elements listed
    /// in hList become b elements, script and style elements are dropped together with their
    /// content, other elements not in Whitelist are unwrapped (children kept), and all
    /// attributes are removed from the elements that remain.
    /// </summary>
    /// <param name="node">Node to sanitize; non-element nodes are only recursed into.</param>
    private void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            if (!Whitelist.ContainsKey(node.Name))
            {
                if (hList.Contains(node.Name))
                {
                    HtmlNode b = doc.CreateElement("b");
                    b.InnerHtml = node.InnerHtml;
                    node.ParentNode.ReplaceChild(b, node);

                    // Continue with the replacement: its children are fresh copies parsed from
                    // InnerHtml, so sanitizing the detached original would leave them untouched.
                    node = b;
                }
                else if (node.Name == "script" || node.Name == "style")
                {
                    // Drop the element including its content; keeping the children would leave
                    // the raw script/style text in the output.
                    node.ParentNode.RemoveChild(node, false);
                    return;
                }
                else
                {
                    // Remove only the tag itself and keep its children in the parent.
                    // NOTE(review): the recursion below walks the removed node's own child list;
                    // this relies on HtmlAgilityPack leaving the promoted children in it — confirm.
                    node.ParentNode.RemoveChild(node, true);
                }
            }

            if (node.HasAttributes)
            {
                // Iterate backwards because each removal shrinks the collection.
                for (int i = node.Attributes.Count - 1; i >= 0; i--)
                {
                    HtmlAttribute currentAttribute = node.Attributes[i];
                    node.Attributes.Remove(currentAttribute);
                }
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

代码基本可用,但有一个问题:子节点的子节点没有被清理,参见下面的示例。

输入:

"Lorem ipsum<h1 style='font-size:38px;'><p style='font-size:38px;'>dolor sit</p></h1> amet <h1 style='font-size:38px;'><strong style='font-size:38px;'>consectetur adipiscing</strong></h1>"

结果:

"Lorem ipsum<b><p style='font-size:38px;'>dolor sit</p></b> amet <b style='font-size:38px;'><strong style='font-size:38px;'>consectetur adipiscing</strong></b>"

问题很可能在于:父节点的标签类型被更改后,不再被识别为原来的父节点,因此子节点无法挂回到更改后的父节点上。

有人知道如何解决这个问题吗?

如果问题不清楚或表述不当,请留言指出。

提前致谢

1 个答案:

答案 0 :(得分:0)

下面的代码解决了这个问题:

        /// <summary>
        /// Sanitizes <paramref name="str"/> when it contains characters that HTML-encoding would
        /// change; otherwise returns it untouched.
        /// </summary>
        /// <param name="str">Text that may contain HTML.</param>
        /// <returns>The trimmed, sanitized markup, or the original text when nothing needs encoding.</returns>
        private string CallSanitizers(string str)
    {
        // If HTML-encoding leaves the text unchanged it holds no characters such as '<' or '&',
        // so there is nothing to parse or strip.
        if (str == HttpUtility.HtmlEncode(str))
        {
            return str;
        }

        doc.LoadHtml(str);

        // Sanitizers() edits the shared document in place. Its return value used to be stored in
        // str and never read, so the call is now made for its effect only.
        Sanitizers();

        return doc.DocumentNode.WriteTo().Trim();
    }

    /// <summary>
    /// Cleans the shared document in four passes: removes script and style nodes, renames the
    /// heading tags from hList to "b", strips all attributes, and unwraps every element that
    /// is not in Whitelist while keeping its children.
    /// </summary>
    /// <returns>The OuterHtml of the document node after the passes.</returns>
    private string Sanitizers()
    {
        // Each pass takes a snapshot (ToList) first, because the loop body changes the tree.
        var scriptsAndStyles = doc.DocumentNode.Descendants()
            .Where(node => node.Name == "script" || node.Name == "style")
            .ToList();
        foreach (var node in scriptsAndStyles)
        {
            node.Remove();
        }

        var headings = doc.DocumentNode.Descendants()
            .Where(node => hList.Contains(node.Name))
            .ToList();
        foreach (var heading in headings)
        {
            heading.Name = "b";
        }

        var nodesWithAttributes = doc.DocumentNode.Descendants()
            .Where(node => node.Attributes != null)
            .ToList();
        foreach (var node in nodesWithAttributes)
        {
            foreach (var attribute in node.Attributes.ToList())
            {
                attribute.Remove();
            }
        }

        var disallowedElements = doc.DocumentNode.Descendants()
            .Where(node => !Whitelist.Contains(node.Name) && node.NodeType == HtmlNodeType.Element)
            .ToList();
        foreach (var element in disallowedElements)
        {
            // true = keep the element's children in its parent.
            element.ParentNode.RemoveChild(element, true);
        }

        return doc.DocumentNode.OuterHtml;
    }

    // List of tags that are replaced by <b></b>
    private List<string> hList = new List<string> { "h1", "h2", "h3", "h4", "h5", "h6" };

    // Tag names that Sanitizers leaves in the document; other elements are unwrapped.
    List<string> Whitelist = new List<string>
    {
        "p", "ul", "li", "br", "b",
        "table", "tr", "th", "td", "strong"
    };

输入

"<head><script>alert('Hello!');</script></head><div><div><h1>Lorem ipsum </h1></div></div> <h1 style='font-size:38px;'><p style='font-size:38px;'>dolor </p></h1> sit <h1 style='font-size:38px;'><strong style='font-size:38px;'>amet</strong></h1>"

输出

"<b>Lorem ipsum</b> <b><p>dolor</p></b> sit <b><strong>amet</strong></b>"