我有一个像这样的html字符串:
<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>
我希望删除所有html标记,以便生成的字符串变为:
foo bar baz
参考 SO 上的其他帖子,我写出了下面这个函数(使用 Html Agility Pack):
' Attempts to strip markup from an HTML string: selects every html, body, p and a
' element, removes each one from its parent, and returns what is left of the document.
' html: the markup to process.
' Returns: the serialized content of the document node after the removals.
' NOTE(review): the author reports that for the sample input this yields "bazbarfoo"
' rather than "foo bar baz"; this snippet is the code under question, not a fix.
Public Shared Function stripTags(ByVal html As String) As String
' NOTE(review): plain is assigned here but never read anywhere in this function.
Dim plain As String = String.Empty
Dim htmldoc As New HtmlAgilityPack.HtmlDocument
htmldoc.LoadHtml(html)
' XPath union: every html, body, p and a element anywhere in the document.
Dim invalidNodes As HtmlAgilityPack.HtmlNodeCollection = htmldoc.DocumentNode.SelectNodes("//html|//body|//p|//a")
' NOTE(review): this tests htmldoc, which was constructed a few lines above, so the
' guard is always true here. It was presumably meant to test invalidNodes - confirm.
If Not htmldoc Is Nothing Then
For Each node In invalidNodes
' The second argument (True) is presumably HtmlAgilityPack's keepGrandChildren flag,
' i.e. the removed node's children are meant to stay in the document - TODO confirm.
node.ParentNode.RemoveChild(node, True)
Next
End If
Return htmldoc.DocumentNode.WriteContentTo
End Function
不幸的是,它返回的并不是我期望的结果,而是:
bazbarfoo
请问我哪里做错了?这是最好的方法吗?
此致问候,祝编码愉快!
更新:根据下面的答案,我写出了这个函数,也许对其他人有用:
''' <summary>
''' Converts an HTML string to plain text by returning the InnerText of the parsed document.
''' Before parsing, a blank line is inserted after every closing p tag and every
''' self-closed br tag ("&lt;br/&gt;") is replaced by a line break, so paragraph and
''' line structure survive in the text output.
''' </summary>
''' <param name="html">The markup to convert. Nothing or an empty string yields an empty string.</param>
''' <returns>The text content of the document with the line breaks described above.</returns>
Public Shared Function stripTags(ByVal html As String) As String
    If String.IsNullOrEmpty(html) Then
        Return String.Empty
    End If

    ' Build the paragraph separator by concatenation. New String(Char, Integer) repeats a
    ' single Char, and Environment.NewLine is a String, so passing it there either fails to
    ' compile (Option Strict On) or repeats only its first character (Option Strict Off).
    Dim paragraphBreak As String = Environment.NewLine & Environment.NewLine

    Dim prepared As String = html.Replace("</p>", "</p>" & paragraphBreak)
    prepared = prepared.Replace("<br/>", Environment.NewLine)

    Dim htmldoc As New HtmlAgilityPack.HtmlDocument
    htmldoc.LoadHtml(prepared)
    Return htmldoc.DocumentNode.InnerText
End Function
答案 0 :(得分:32)
为什么不直接返回 htmldoc.DocumentNode.InnerText,
而要去删除所有非文本节点呢?这样应该就能得到你想要的结果。
答案 1 :(得分:2)
删除白名单中未找到的标签和属性。
''' <summary>
''' Whitelist-based HTML sanitizer built on Html Agility Pack.
''' Elements whose tag name is not in the whitelist are removed while their children
''' (including any text they contain) are kept in place; on whitelisted elements every
''' attribute that is not explicitly allowed for that tag is removed.
''' </summary>
Public NotInheritable Class HtmlSanitizer

    ' Placeholder tag name assigned to every element that is not whitelisted, so that all
    ' of them can be selected with one fixed XPath after the document is re-parsed.
    Private Const RemovableNodeName As String = "removeablenode"
    Private Const RemovableNodesXPath As String = "//" & RemovableNodeName

    ' Allowed tag name -> allowed attribute names (Nothing means "no attributes allowed").
    Private Shared ReadOnly Whitelist As IDictionary(Of String, String())

    ' Utility class: not meant to be instantiated.
    Private Sub New()
    End Sub

    Shared Sub New()
        Whitelist = New Dictionary(Of String, String())() From { _
            {"a", New String() {"href"}}, _
            {"strong", Nothing}, _
            {"em", Nothing}, _
            {"blockquote", Nothing}, _
            {"b", Nothing}, _
            {"p", Nothing}, _
            {"ul", Nothing}, _
            {"ol", Nothing}, _
            {"li", Nothing}, _
            {"div", New String() {"align"}}, _
            {"strike", Nothing}, _
            {"u", Nothing}, _
            {"sub", Nothing}, _
            {"sup", Nothing}, _
            {"table", Nothing}, _
            {"tr", Nothing}, _
            {"td", Nothing}, _
            {"th", Nothing} _
        }
    End Sub

    ''' <summary>
    ''' Sanitizes an HTML string against the whitelist.
    ''' </summary>
    ''' <param name="input">The markup to sanitize. Nothing, empty or whitespace-only input yields an empty string.</param>
    ''' <returns>The markup with non-whitelisted elements unwrapped and non-whitelisted attributes removed.</returns>
    Public Shared Function Sanitize(input As String) As String
        If input Is Nothing OrElse input.Trim().Length < 1 Then
            Return String.Empty
        End If

        Dim document As New HtmlDocument()
        document.LoadHtml(input)

        ' Pass 1: strip attributes and rename disallowed elements to the placeholder tag.
        SanitizeNode(document.DocumentNode)

        ' Pass 2: re-parse the serialized result and unwrap every placeholder element.
        Return StripHtml(document.DocumentNode.WriteTo().Trim(), RemovableNodesXPath)
    End Function

    ' Sanitizes every child of parentNode, walking the child list from last to first.
    Private Shared Sub SanitizeChildren(parentNode As HtmlNode)
        For i As Integer = parentNode.ChildNodes.Count - 1 To 0 Step -1
            SanitizeNode(parentNode.ChildNodes(i))
        Next
    End Sub

    ' Applies the whitelist to a single node and then recurses into its children.
    ' Non-element nodes are left untouched apart from the recursion.
    Private Shared Sub SanitizeNode(node As HtmlNode)
        If node.NodeType = HtmlNodeType.Element Then
            Dim allowedAttributes As String() = Nothing
            If Not Whitelist.TryGetValue(node.Name, allowedAttributes) Then
                ' Disallowed tag: mark it for unwrapping. No shared state is recorded, so
                ' repeated or concurrent calls cannot influence one another.
                node.Name = RemovableNodeName
            ElseIf node.HasAttributes Then
                ' Walk backwards because attributes are removed while iterating.
                For i As Integer = node.Attributes.Count - 1 To 0 Step -1
                    Dim currentAttribute As HtmlAttribute = node.Attributes(i)
                    Dim isAllowed As Boolean = allowedAttributes IsNot Nothing AndAlso _
                        Array.IndexOf(allowedAttributes, currentAttribute.Name) >= 0
                    If Not isAllowed Then
                        node.Attributes.Remove(currentAttribute)
                    End If
                Next
            End If
        End If

        If node.HasChildNodes Then
            SanitizeChildren(node)
        End If
    End Sub

    ' Parses html, unwraps every node matched by xPath (children are kept) and returns
    ' the serialized content of the resulting document.
    Private Shared Function StripHtml(html As String, xPath As String) As String
        Dim htmlDoc As New HtmlDocument()
        htmlDoc.LoadHtml(html)

        If xPath.Length > 0 Then
            Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)
            ' SelectNodes yields Nothing rather than an empty collection when nothing matches.
            If invalidNodes IsNot Nothing Then
                For Each node As HtmlNode In invalidNodes
                    UnwrapNode(node)
                Next
            End If
        End If

        Return htmlDoc.DocumentNode.WriteContentTo()
    End Function

    ' Replaces node by its own children, preserving their order. The children are moved
    ' one by one in front of the node instead of relying on RemoveChild(node, True),
    ' which has been reported to reorder the kept children.
    Private Shared Sub UnwrapNode(node As HtmlNode)
        Dim parent As HtmlNode = node.ParentNode
        If parent Is Nothing Then
            Return
        End If

        If node.HasChildNodes Then
            ' Iterate over a snapshot so that moving children cannot disturb the enumeration.
            For Each child As HtmlNode In New List(Of HtmlNode)(node.ChildNodes)
                parent.InsertBefore(child, node)
            Next
        End If

        parent.RemoveChild(node)
    End Sub
End Class
答案 2 :(得分:1)
您似乎假设 For Each 会从头到尾按文档顺序遍历节点。如果想确保这一点,请改用普通的 For 循环。您甚至无法保证 XPath 选择器会按您期望的顺序返回节点,不过在这个例子里您的假设可能是对的……
此致问候,Brunis
答案 3 :(得分:0)
修改下面这几行,就可以得到你想要的结果……
' Parses html, removes every node matched by xPath while asking HtmlAgilityPack to keep
' the removed nodes' children, and returns the serialized content of the document.
' html:  the markup to process.
' xPath: XPath selecting the nodes to remove; an empty string skips the removal step.
Private Shared Function StripHtml(html As String, xPath As String) As String
    Dim htmlDoc As New HtmlAgilityPack.HtmlDocument()
    htmlDoc.LoadHtml(html)
    If xPath.Length > 0 Then
        Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)
        '------- edit this line -------------------
        'For Each node As HtmlNode In invalidNodes
        'node.ParentNode.RemoveChild(node, True)
        'Next
        '
        ' result-> bazbarfoo
        '
        '------- modify line ----------------------
        ' SelectNodes yields Nothing (not an empty collection) when nothing matches,
        ' so guard before touching Count.
        If invalidNodes IsNot Nothing Then
            ' Walk the matches from last to first, as proposed by this answer.
            For i As Integer = invalidNodes.Count - 1 To 0 Step -1
                Dim current As HtmlNode = invalidNodes.Item(i)
                current.ParentNode.RemoveChild(current, True)
            Next
        End If
        '
        ' result-> foo bar baz
        '
    End If
    Return htmlDoc.DocumentNode.WriteContentTo()
End Function
答案 4 :(得分:-6)
您可以使用以下代码。
/// <summary>
/// Removes everything that looks like a tag (a '&lt;' up to the nearest following '&gt;')
/// from the given string using a regular expression.
/// </summary>
/// <remarks>
/// This is a purely textual filter, not an HTML parser: it does not decode entities and
/// it can be confused by '&gt;' characters inside attribute values, comments or scripts.
/// </remarks>
/// <param name="source">The markup to strip. Null or empty input yields an empty string.</param>
/// <returns>The input with all tag-like spans removed.</returns>
public string RemoveHTMLTags(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    // Singleline makes '.' match '\n' as well, so a tag that is broken across
    // several lines is removed too instead of being left in the output.
    const string tagPattern = "<.*?>";
    return Regex.Replace(source, tagPattern, string.Empty, RegexOptions.Singleline);
}